1 // This file is a fragment of the yjit.o compilation unit. See yjit.c.
2 #include "internal.h"
3 #include "gc.h"
4 #include "internal/compile.h"
5 #include "internal/class.h"
6 #include "internal/hash.h"
7 #include "internal/object.h"
8 #include "internal/sanitizers.h"
9 #include "internal/string.h"
10 #include "internal/struct.h"
11 #include "internal/variable.h"
12 #include "internal/re.h"
13 #include "probes.h"
14 #include "probes_helper.h"
15 #include "yjit.h"
16 #include "yjit_iface.h"
17 #include "yjit_core.h"
18 #include "yjit_codegen.h"
19 #include "yjit_asm.h"
21 // Map from YARV opcodes to code generation functions
22 static codegen_fn gen_fns[VM_INSTRUCTION_SIZE] = { NULL };
24 // Map from method entries to code generation functions
25 static st_table *yjit_method_codegen_table = NULL;
27 // Code for exiting back to the interpreter from the leave instruction
28 static void *leave_exit_code;
30 // Code for the full logic of returning from a C method and exiting to the interpreter
31 static uint32_t outline_full_cfunc_return_pos;
33 // For implementing global code invalidation
34 struct codepage_patch {
35 uint32_t inline_patch_pos;
36 uint32_t outlined_target_pos;
39 typedef rb_darray(struct codepage_patch) patch_array_t;
41 static patch_array_t global_inval_patches = NULL;
43 // Print the current source location for debugging purposes
44 RBIMPL_ATTR_MAYBE_UNUSED()
45 static void
46 jit_print_loc(jitstate_t *jit, const char *msg)
48 char *ptr;
49 long len;
50 VALUE path = rb_iseq_path(jit->iseq);
51 RSTRING_GETMEM(path, ptr, len);
52 fprintf(stderr, "%s %.*s:%u\n", msg, (int)len, ptr, rb_iseq_line_no(jit->iseq, jit->insn_idx));
55 // dump an object for debugging purposes
56 RBIMPL_ATTR_MAYBE_UNUSED()
57 static void
58 jit_obj_info_dump(codeblock_t *cb, x86opnd_t opnd) {
59 push_regs(cb);
60 mov(cb, C_ARG_REGS[0], opnd);
61 call_ptr(cb, REG0, (void *)rb_obj_info_dump);
62 pop_regs(cb);
65 // Get the current instruction's opcode
66 static int
67 jit_get_opcode(jitstate_t *jit)
69 return jit->opcode;
72 // Get the index of the next instruction
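// (insn_len() counts the opcode slot plus its operands; for example, an
// instruction with one operand occupies two VALUE slots, so the next index
// is insn_idx + 2.)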
73 static uint32_t
74 jit_next_insn_idx(jitstate_t *jit)
76 return jit->insn_idx + insn_len(jit_get_opcode(jit));
79 // Get an instruction argument by index
80 static VALUE
81 jit_get_arg(jitstate_t *jit, size_t arg_idx)
83 RUBY_ASSERT(arg_idx + 1 < (size_t)insn_len(jit_get_opcode(jit)));
84 return *(jit->pc + arg_idx + 1);
87 // Load a VALUE into a register and keep track of the reference if it is on the GC heap.
88 static void
89 jit_mov_gc_ptr(jitstate_t *jit, codeblock_t *cb, x86opnd_t reg, VALUE ptr)
91 RUBY_ASSERT(reg.type == OPND_REG && reg.num_bits == 64);
93 // Load the pointer constant into the specified register
94 mov(cb, reg, const_ptr_opnd((void*)ptr));
96 // The pointer immediate is encoded as the last part of the mov written out
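// (On x86-64, `mov reg64, imm64` places the 8-byte immediate at the end of
// the encoding, so its offset inside the code block is write_pos - sizeof(VALUE).
// Recording that offset lets the GC hooks find the embedded VALUE later.)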
97 uint32_t ptr_offset = cb->write_pos - sizeof(VALUE);
99 if (!SPECIAL_CONST_P(ptr)) {
100 if (!rb_darray_append(&jit->block->gc_object_offsets, ptr_offset)) {
101 rb_bug("allocation failed");
106 // Check if we are compiling the instruction at the stub PC
107 // Meaning we are compiling the instruction that is next to execute
108 static bool
109 jit_at_current_insn(jitstate_t *jit)
111 const VALUE *ec_pc = jit->ec->cfp->pc;
112 return (ec_pc == jit->pc);
115 // Peek at the nth topmost value on the Ruby stack.
116 // Returns the topmost value when n == 0.
117 static VALUE
118 jit_peek_at_stack(jitstate_t *jit, ctx_t *ctx, int n)
120 RUBY_ASSERT(jit_at_current_insn(jit));
122 // Note: this does not account for ctx->sp_offset because
123 // this is only available when hitting a stub, and while
124 // hitting a stub, cfp->sp needs to be up to date in case
125 // codegen functions trigger GC. See :stub-sp-flush:.
126 VALUE *sp = jit->ec->cfp->sp;
128 return *(sp - 1 - n);
131 static VALUE
132 jit_peek_at_self(jitstate_t *jit, ctx_t *ctx)
134 return jit->ec->cfp->self;
137 RBIMPL_ATTR_MAYBE_UNUSED()
138 static VALUE
139 jit_peek_at_local(jitstate_t *jit, ctx_t *ctx, int n)
141 RUBY_ASSERT(jit_at_current_insn(jit));
143 int32_t local_table_size = jit->iseq->body->local_table_size;
144 RUBY_ASSERT(n < (int)jit->iseq->body->local_table_size);
146 const VALUE *ep = jit->ec->cfp->ep;
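// Locals sit below the VM_ENV_DATA_SIZE environment slots on the EP.
// Worked example, assuming VM_ENV_DATA_SIZE == 3 and local_table_size == 2:
// n == 0 reads ep[-4] and n == 1 reads ep[-3].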
147 return ep[-VM_ENV_DATA_SIZE - local_table_size + n + 1];
150 // Save the incremented PC on the CFP
151 // This is necessary when callees can raise or allocate
152 static void
153 jit_save_pc(jitstate_t *jit, x86opnd_t scratch_reg)
155 codeblock_t *cb = jit->cb;
156 mov(cb, scratch_reg, const_ptr_opnd(jit->pc + insn_len(jit->opcode)));
157 mov(cb, mem_opnd(64, REG_CFP, offsetof(rb_control_frame_t, pc)), scratch_reg);
160 // Save the current SP on the CFP
161 // This realigns the interpreter SP with the JIT SP
162 // Note: this will change the current value of REG_SP,
163 // which could invalidate memory operands
164 static void
165 jit_save_sp(jitstate_t *jit, ctx_t *ctx)
167 if (ctx->sp_offset != 0) {
168 x86opnd_t stack_pointer = ctx_sp_opnd(ctx, 0);
169 codeblock_t *cb = jit->cb;
170 lea(cb, REG_SP, stack_pointer);
171 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, sp), REG_SP);
172 ctx->sp_offset = 0;
176 // jit_save_pc() + jit_save_sp(). Should be used before calling a routine that
177 // could:
178 // - Perform GC allocation
179 // - Take the VM lock through RB_VM_LOCK_ENTER()
180 // - Perform Ruby method call
181 static void
182 jit_prepare_routine_call(jitstate_t *jit, ctx_t *ctx, x86opnd_t scratch_reg)
184 jit->record_boundary_patch_point = true;
185 jit_save_pc(jit, scratch_reg);
186 jit_save_sp(jit, ctx);
188 // In case the routine calls Ruby methods, it can set local variables
189 // through Kernel#binding and other means.
190 ctx_clear_local_types(ctx);
193 // Record the current codeblock write position for rewriting into a jump into
194 // the outlined block later. Used to implement global code invalidation.
195 static void
196 record_global_inval_patch(const codeblock_t *cb, uint32_t outline_block_target_pos)
198 struct codepage_patch patch_point = { cb->write_pos, outline_block_target_pos };
199 if (!rb_darray_append(&global_inval_patches, patch_point)) rb_bug("allocation failed");
202 static bool jit_guard_known_klass(jitstate_t *jit, ctx_t *ctx, VALUE known_klass, insn_opnd_t insn_opnd, VALUE sample_instance, const int max_chain_depth, uint8_t *side_exit);
204 #if YJIT_STATS
206 // Add a comment at the current position in the code block
207 static void
208 _add_comment(codeblock_t *cb, const char *comment_str)
210 // We can't add comments to the outlined code block
211 if (cb == ocb)
212 return;
214 // Avoid adding duplicate comment strings (can happen due to deferred codegen)
215 size_t num_comments = rb_darray_size(yjit_code_comments);
216 if (num_comments > 0) {
217 struct yjit_comment last_comment = rb_darray_get(yjit_code_comments, num_comments - 1);
218 if (last_comment.offset == cb->write_pos && strcmp(last_comment.comment, comment_str) == 0) {
219 return;
223 struct yjit_comment new_comment = (struct yjit_comment){ cb->write_pos, comment_str };
224 rb_darray_append(&yjit_code_comments, new_comment);
227 // Comments for generated machine code
228 #define ADD_COMMENT(cb, comment) _add_comment((cb), (comment))
230 // Verify the ctx's types and mappings against the compile-time stack, self,
231 // and locals.
232 static void
233 verify_ctx(jitstate_t *jit, ctx_t *ctx)
235 // Only able to check types when at current insn
236 RUBY_ASSERT(jit_at_current_insn(jit));
238 VALUE self_val = jit_peek_at_self(jit, ctx);
239 if (type_diff(yjit_type_of_value(self_val), ctx->self_type) == INT_MAX) {
240 rb_bug("verify_ctx: ctx type (%s) incompatible with actual value of self: %s", yjit_type_name(ctx->self_type), rb_obj_info(self_val));
243 for (int i = 0; i < ctx->stack_size && i < MAX_TEMP_TYPES; i++) {
244 temp_type_mapping_t learned = ctx_get_opnd_mapping(ctx, OPND_STACK(i));
245 VALUE val = jit_peek_at_stack(jit, ctx, i);
246 val_type_t detected = yjit_type_of_value(val);
248 if (learned.mapping.kind == TEMP_SELF) {
249 if (self_val != val) {
250 rb_bug("verify_ctx: stack value was mapped to self, but values did not match\n"
251 " stack: %s\n"
252 " self: %s",
253 rb_obj_info(val),
254 rb_obj_info(self_val));
258 if (learned.mapping.kind == TEMP_LOCAL) {
259 int local_idx = learned.mapping.idx;
260 VALUE local_val = jit_peek_at_local(jit, ctx, local_idx);
261 if (local_val != val) {
262 rb_bug("verify_ctx: stack value was mapped to local, but values did not match\n"
263 " stack: %s\n"
264 " local %i: %s",
265 rb_obj_info(val),
266 local_idx,
267 rb_obj_info(local_val));
271 if (type_diff(detected, learned.type) == INT_MAX) {
272 rb_bug("verify_ctx: ctx type (%s) incompatible with actual value on stack: %s", yjit_type_name(learned.type), rb_obj_info(val));
276 int32_t local_table_size = jit->iseq->body->local_table_size;
277 for (int i = 0; i < local_table_size && i < MAX_TEMP_TYPES; i++) {
278 val_type_t learned = ctx->local_types[i];
279 VALUE val = jit_peek_at_local(jit, ctx, i);
280 val_type_t detected = yjit_type_of_value(val);
282 if (type_diff(detected, learned) == INT_MAX) {
283 rb_bug("verify_ctx: ctx type (%s) incompatible with actual value of local: %s", yjit_type_name(learned), rb_obj_info(val));
288 #else
290 #define ADD_COMMENT(cb, comment) ((void)0)
291 #define verify_ctx(jit, ctx) ((void)0)
293 #endif // if YJIT_STATS
295 #if YJIT_STATS
297 // Increment a profiling counter with counter_name
298 #define GEN_COUNTER_INC(cb, counter_name) _gen_counter_inc(cb, &(yjit_runtime_counters . counter_name))
299 static void
300 _gen_counter_inc(codeblock_t *cb, int64_t *counter)
302 if (!rb_yjit_opts.gen_stats) return;
304 // Use REG1 because there might be a return value in REG0
305 mov(cb, REG1, const_ptr_opnd(counter));
306 cb_write_lock_prefix(cb); // for ractors.
307 add(cb, mem_opnd(64, REG1, 0), imm_opnd(1));
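// (The lock prefix makes the memory add a single atomic read-modify-write,
// so increments from code running on multiple Ractors are not lost.)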
310 // Increment a counter then take an existing side exit.
311 #define COUNTED_EXIT(jit, side_exit, counter_name) _counted_side_exit(jit, side_exit, &(yjit_runtime_counters . counter_name))
312 static uint8_t *
313 _counted_side_exit(jitstate_t* jit, uint8_t *existing_side_exit, int64_t *counter)
315 if (!rb_yjit_opts.gen_stats) return existing_side_exit;
317 uint8_t *start = cb_get_ptr(jit->ocb, jit->ocb->write_pos);
318 _gen_counter_inc(jit->ocb, counter);
319 jmp_ptr(jit->ocb, existing_side_exit);
320 return start;
323 #else
325 #define GEN_COUNTER_INC(cb, counter_name) ((void)0)
326 #define COUNTED_EXIT(jit, side_exit, counter_name) side_exit
328 #endif // if YJIT_STATS
330 // Generate an exit to return to the interpreter
331 static uint32_t
332 yjit_gen_exit(VALUE *exit_pc, ctx_t *ctx, codeblock_t *cb)
334 const uint32_t code_pos = cb->write_pos;
336 ADD_COMMENT(cb, "exit to interpreter");
338 // Generate the code to exit to the interpreter
339 // Write the adjusted SP back into the CFP
340 if (ctx->sp_offset != 0) {
341 x86opnd_t stack_pointer = ctx_sp_opnd(ctx, 0);
342 lea(cb, REG_SP, stack_pointer);
343 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, sp), REG_SP);
346 // Update CFP->PC
347 mov(cb, RAX, const_ptr_opnd(exit_pc));
348 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, pc), RAX);
350 // Accumulate stats about interpreter exits
351 #if YJIT_STATS
352 if (rb_yjit_opts.gen_stats) {
353 mov(cb, RDI, const_ptr_opnd(exit_pc));
354 call_ptr(cb, RSI, (void *)&yjit_count_side_exit_op);
356 #endif
358 pop(cb, REG_SP);
359 pop(cb, REG_EC);
360 pop(cb, REG_CFP);
362 mov(cb, RAX, imm_opnd(Qundef));
363 ret(cb);
365 return code_pos;
368 // Generate a continuation for gen_leave() that exits to the interpreter at REG_CFP->pc.
369 static uint8_t *
370 yjit_gen_leave_exit(codeblock_t *cb)
372 uint8_t *code_ptr = cb_get_ptr(cb, cb->write_pos);
374 // Note, gen_leave() fully reconstructs interpreter state and leaves the
375 // return value in RAX before coming here.
377 // Every exit to the interpreter should be counted
378 GEN_COUNTER_INC(cb, leave_interp_return);
380 pop(cb, REG_SP);
381 pop(cb, REG_EC);
382 pop(cb, REG_CFP);
384 ret(cb);
386 return code_ptr;
389 // Fill code_for_exit_from_stub. This is used by branch_stub_hit() to exit
390 // to the interpreter when it cannot service a stub by generating new code.
391 // Before coming here, branch_stub_hit() takes care of fully reconstructing
392 // interpreter state.
393 static void
394 gen_code_for_exit_from_stub(void)
396 codeblock_t *cb = ocb;
397 code_for_exit_from_stub = cb_get_ptr(cb, cb->write_pos);
399 GEN_COUNTER_INC(cb, exit_from_branch_stub);
401 pop(cb, REG_SP);
402 pop(cb, REG_EC);
403 pop(cb, REG_CFP);
405 mov(cb, RAX, imm_opnd(Qundef));
406 ret(cb);
409 // :side-exit:
410 // Get an exit for the current instruction in the outlined block. The code
411 // for each instruction often begins with several guards before proceeding
412 // to do work. When guards fail, an option we have is to exit to the
413 // interpreter at an instruction boundary. The piece of code that takes
414 // care of reconstructing interpreter state and exiting out of generated
415 // code is called the side exit.
417 // No guards change the logic for reconstructing interpreter state at the
418 // moment, so there is one unique side exit for each context. Note that
419 // it's incorrect to jump to the side exit after any ctx stack push/pop operations
420 // since they change the logic required for reconstructing interpreter state.
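// Typical usage in a codegen function looks roughly like this (sketch):
//
//   uint8_t *side_exit = yjit_side_exit(jit, ctx);
//   ... emit a guard ...
//   jnz_ptr(cb, side_exit); // fall back to the interpreter on guard failure
//
// See gen_setlocal_wc0() below for a concrete instance of this pattern.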
421 static uint8_t *
422 yjit_side_exit(jitstate_t *jit, ctx_t *ctx)
424 if (!jit->side_exit_for_pc) {
425 codeblock_t *ocb = jit->ocb;
426 uint32_t pos = yjit_gen_exit(jit->pc, ctx, ocb);
427 jit->side_exit_for_pc = cb_get_ptr(ocb, pos);
430 return jit->side_exit_for_pc;
433 // Ensure that there is an exit for the start of the block being compiled.
434 // Block invalidation uses this exit.
435 static void
436 jit_ensure_block_entry_exit(jitstate_t *jit)
438 block_t *block = jit->block;
439 if (block->entry_exit) return;
441 if (jit->insn_idx == block->blockid.idx) {
442 // We are compiling the first instruction in the block.
443 // Generate the exit with the cache in jitstate.
444 block->entry_exit = yjit_side_exit(jit, &block->ctx);
446 else {
447 VALUE *pc = yjit_iseq_pc_at_idx(block->blockid.iseq, block->blockid.idx);
448 uint32_t pos = yjit_gen_exit(pc, &block->ctx, ocb);
449 block->entry_exit = cb_get_ptr(ocb, pos);
453 // Generate a runtime guard that ensures the PC is at the start of the iseq,
454 // otherwise take a side exit. This is to handle the situation of optional
455 // parameters. When a function with optional parameters is called, the entry
456 // PC for the method isn't necessarily 0, but we always generate code that
457 // assumes the entry point is 0.
458 static void
459 yjit_pc_guard(codeblock_t *cb, const rb_iseq_t *iseq)
461 RUBY_ASSERT(cb != NULL);
463 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, pc));
464 mov(cb, REG1, const_ptr_opnd(iseq->body->iseq_encoded));
465 xor(cb, REG0, REG1);
467 // xor should impact ZF, so we can jz here
468 uint32_t pc_is_zero = cb_new_label(cb, "pc_is_zero");
469 jz_label(cb, pc_is_zero);
471 // We're not starting at the first PC, so we need to exit.
472 GEN_COUNTER_INC(cb, leave_start_pc_non_zero);
474 pop(cb, REG_SP);
475 pop(cb, REG_EC);
476 pop(cb, REG_CFP);
478 mov(cb, RAX, imm_opnd(Qundef));
479 ret(cb);
481 // PC should be at the beginning
482 cb_write_label(cb, pc_is_zero);
483 cb_link_labels(cb);
486 // The code we generate in gen_send_cfunc() doesn't fire the c_return TracePoint event
487 // like the interpreter. When tracing for c_return is enabled, we patch the code after
488 // the C method return to call into this to fire the event.
489 static void
490 full_cfunc_return(rb_execution_context_t *ec, VALUE return_value)
492 rb_control_frame_t *cfp = ec->cfp;
493 RUBY_ASSERT_ALWAYS(cfp == GET_EC()->cfp);
494 const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(cfp);
496 RUBY_ASSERT_ALWAYS(RUBYVM_CFUNC_FRAME_P(cfp));
497 RUBY_ASSERT_ALWAYS(me->def->type == VM_METHOD_TYPE_CFUNC);
499 // CHECK_CFP_CONSISTENCY("full_cfunc_return"); TODO revive this
501 // Pop the C func's frame and fire the c_return TracePoint event
502 // Note that this is the same order as vm_call_cfunc_with_frame().
503 rb_vm_pop_frame(ec);
504 EXEC_EVENT_HOOK(ec, RUBY_EVENT_C_RETURN, cfp->self, me->def->original_id, me->called_id, me->owner, return_value);
505 // Note, this deviates from the interpreter in that users need to enable
506 // a c_return TracePoint for this DTrace hook to work. A reasonable change
507 // since the Ruby return event works this way as well.
508 RUBY_DTRACE_CMETHOD_RETURN_HOOK(ec, me->owner, me->def->original_id);
510 // Push return value into the caller's stack. We know that it's a frame that
511 // uses cfp->sp because we are patching a call done with gen_send_cfunc().
512 ec->cfp->sp[0] = return_value;
513 ec->cfp->sp++;
516 // Landing code for when c_return tracing is enabled. See full_cfunc_return().
517 static void
518 gen_full_cfunc_return(void)
520 codeblock_t *cb = ocb;
521 outline_full_cfunc_return_pos = ocb->write_pos;
523 // This chunk of code expects REG_EC to be filled properly and
524 // RAX to contain the return value of the C method.
526 // Call full_cfunc_return()
527 mov(cb, C_ARG_REGS[0], REG_EC);
528 mov(cb, C_ARG_REGS[1], RAX);
529 call_ptr(cb, REG0, (void *)full_cfunc_return);
531 // Count the exit
532 GEN_COUNTER_INC(cb, traced_cfunc_return);
534 // Return to the interpreter
535 pop(cb, REG_SP);
536 pop(cb, REG_EC);
537 pop(cb, REG_CFP);
539 mov(cb, RAX, imm_opnd(Qundef));
540 ret(cb);
544 // Compile an interpreter entry block to be inserted into an iseq.
545 // Returns `NULL` if compilation fails.
547 static uint8_t *
548 yjit_entry_prologue(codeblock_t *cb, const rb_iseq_t *iseq)
550 RUBY_ASSERT(cb != NULL);
552 enum { MAX_PROLOGUE_SIZE = 1024 };
554 // Check if we have enough executable memory
555 if (cb->write_pos + MAX_PROLOGUE_SIZE >= cb->mem_size) {
556 return NULL;
559 const uint32_t old_write_pos = cb->write_pos;
561 // Align the current write position to cache line boundaries
562 cb_align_pos(cb, 64);
564 uint8_t *code_ptr = cb_get_ptr(cb, cb->write_pos);
565 ADD_COMMENT(cb, "yjit entry");
567 push(cb, REG_CFP);
568 push(cb, REG_EC);
569 push(cb, REG_SP);
571 // We are passed EC and CFP
572 mov(cb, REG_EC, C_ARG_REGS[0]);
573 mov(cb, REG_CFP, C_ARG_REGS[1]);
575 // Load the current SP from the CFP into REG_SP
576 mov(cb, REG_SP, member_opnd(REG_CFP, rb_control_frame_t, sp));
578 // Setup cfp->jit_return
579 // TODO: this could use an IP relative LEA instead of an 8 byte immediate
580 mov(cb, REG0, const_ptr_opnd(leave_exit_code));
581 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, jit_return), REG0);
583 // We're compiling iseqs that we *expect* to start at `insn_idx`. But in
584 // the case of optional parameters, the interpreter can set the pc to a
585 // different location depending on the optional parameters. If an iseq
586 // has optional parameters, we'll add a runtime check that the PC we've
587 // compiled for is the same PC that the interpreter wants us to run with.
588 // If they don't match, then we'll take a side exit.
589 if (iseq->body->param.flags.has_opt) {
590 yjit_pc_guard(cb, iseq);
593 // Verify MAX_PROLOGUE_SIZE
594 RUBY_ASSERT_ALWAYS(cb->write_pos - old_write_pos <= MAX_PROLOGUE_SIZE);
596 return code_ptr;
599 // Generate code to check for interrupts and take a side-exit.
600 // Warning: this function clobbers REG0
601 static void
602 yjit_check_ints(codeblock_t *cb, uint8_t *side_exit)
604 // Check for interrupts
605 // see RUBY_VM_CHECK_INTS(ec) macro
606 ADD_COMMENT(cb, "RUBY_VM_CHECK_INTS(ec)");
607 mov(cb, REG0_32, member_opnd(REG_EC, rb_execution_context_t, interrupt_mask));
608 not(cb, REG0_32);
609 test(cb, member_opnd(REG_EC, rb_execution_context_t, interrupt_flag), REG0_32);
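// (the not + test above computes interrupt_flag & ~interrupt_mask; a nonzero
// result means an unmasked interrupt is pending, so we take the side exit)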
610 jnz_ptr(cb, side_exit);
613 // Generate a stubbed unconditional jump to the next bytecode instruction.
614 // Blocks that are part of a guard chain can use this to share the same successor.
615 static void
616 jit_jump_to_next_insn(jitstate_t *jit, const ctx_t *current_context)
618 // Reset the depth since in current usages we only ever jump to
619 // chain_depth > 0 from the same instruction.
620 ctx_t reset_depth = *current_context;
621 reset_depth.chain_depth = 0;
623 blockid_t jump_block = { jit->iseq, jit_next_insn_idx(jit) };
625 // We are at the end of the current instruction. Record the boundary.
626 if (jit->record_boundary_patch_point) {
627 uint32_t exit_pos = yjit_gen_exit(jit->pc + insn_len(jit->opcode), &reset_depth, jit->ocb);
628 record_global_inval_patch(jit->cb, exit_pos);
629 jit->record_boundary_patch_point = false;
632 // Generate the jump instruction
633 gen_direct_jump(
634 jit,
635 &reset_depth,
636 jump_block
640 // Compile a sequence of bytecode instructions for a given basic block version.
641 // Part of gen_block_version().
642 static block_t *
643 gen_single_block(blockid_t blockid, const ctx_t *start_ctx, rb_execution_context_t *ec)
645 RUBY_ASSERT(cb != NULL);
646 verify_blockid(blockid);
648 // Allocate the new block
649 block_t *block = calloc(1, sizeof(block_t));
650 if (!block) {
651 return NULL;
654 // Copy the starting context to avoid mutating it
655 ctx_t ctx_copy = *start_ctx;
656 ctx_t *ctx = &ctx_copy;
658 // Limit the number of specialized versions for this block
659 *ctx = limit_block_versions(blockid, ctx);
661 // Save the starting context on the block.
662 block->blockid = blockid;
663 block->ctx = *ctx;
665 RUBY_ASSERT(!(blockid.idx == 0 && start_ctx->stack_size > 0));
667 const rb_iseq_t *iseq = block->blockid.iseq;
668 const unsigned int iseq_size = iseq->body->iseq_size;
669 uint32_t insn_idx = block->blockid.idx;
670 const uint32_t starting_insn_idx = insn_idx;
672 // Initialize a JIT state object
673 jitstate_t jit = {
674 .cb = cb,
675 .ocb = ocb,
676 .block = block,
677 .iseq = iseq,
678 .ec = ec
681 // Mark the start position of the block
682 block->start_addr = cb_get_write_ptr(cb);
684 // For each instruction to compile
685 while (insn_idx < iseq_size) {
686 // Get the current pc and opcode
687 VALUE *pc = yjit_iseq_pc_at_idx(iseq, insn_idx);
688 int opcode = yjit_opcode_at_pc(iseq, pc);
689 RUBY_ASSERT(opcode >= 0 && opcode < VM_INSTRUCTION_SIZE);
691 // opt_getinlinecache wants to be in a block all on its own. Cut the block short
692 // if we run into it. See gen_opt_getinlinecache() for details.
693 if (opcode == BIN(opt_getinlinecache) && insn_idx > starting_insn_idx) {
694 jit_jump_to_next_insn(&jit, ctx);
695 break;
698 // Set the current instruction
699 jit.insn_idx = insn_idx;
700 jit.opcode = opcode;
701 jit.pc = pc;
702 jit.side_exit_for_pc = NULL;
704 // If previous instruction requested to record the boundary
705 if (jit.record_boundary_patch_point) {
706 // Generate an exit to this instruction and record it
707 uint32_t exit_pos = yjit_gen_exit(jit.pc, ctx, ocb);
708 record_global_inval_patch(cb, exit_pos);
709 jit.record_boundary_patch_point = false;
712 // Verify our existing assumption (DEBUG)
713 if (jit_at_current_insn(&jit)) {
714 verify_ctx(&jit, ctx);
717 // Lookup the codegen function for this instruction
718 codegen_fn gen_fn = gen_fns[opcode];
719 codegen_status_t status = YJIT_CANT_COMPILE;
720 if (gen_fn) {
721 if (0) {
722 fprintf(stderr, "compiling %d: %s\n", insn_idx, insn_name(opcode));
723 print_str(cb, insn_name(opcode));
726 // :count-placement:
727 // Count bytecode instructions that execute in generated code.
728 // Note that the increment happens even when the output takes a side exit.
729 GEN_COUNTER_INC(cb, exec_instruction);
731 // Add a comment for the name of the YARV instruction
732 ADD_COMMENT(cb, insn_name(opcode));
734 // Call the code generation function
735 status = gen_fn(&jit, ctx, cb);
738 // If we can't compile this instruction
739 // exit to the interpreter and stop compiling
740 if (status == YJIT_CANT_COMPILE) {
741 // TODO: if the codegen function makes changes to ctx and then returns YJIT_CANT_COMPILE,
742 // the exit this generates would be wrong. We could save a copy of the entry context
743 // and assert that ctx is the same here.
744 uint32_t exit_off = yjit_gen_exit(jit.pc, ctx, cb);
746 // If this is the first instruction in the block, then we can use
747 // the exit for block->entry_exit.
748 if (insn_idx == block->blockid.idx) {
749 block->entry_exit = cb_get_ptr(cb, exit_off);
751 break;
754 // For now, reset the chain depth after each instruction as only the
755 // first instruction in the block can concern itself with the depth.
756 ctx->chain_depth = 0;
758 // Move to the next instruction to compile
759 insn_idx += insn_len(opcode);
761 // If the instruction terminates this block
762 if (status == YJIT_END_BLOCK) {
763 break;
767 // Mark the end position of the block
768 block->end_addr = cb_get_write_ptr(cb);
770 // Store the index of the last instruction in the block
771 block->end_idx = insn_idx;
773 // We currently can't handle cases where the request is for a block that
774 // doesn't go to the next instruction.
775 RUBY_ASSERT(!jit.record_boundary_patch_point);
777 // If code for the block doesn't fit, free the block and fail.
778 if (cb->dropped_bytes || ocb->dropped_bytes) {
779 yjit_free_block(block);
780 return NULL;
783 if (YJIT_DUMP_MODE >= 2) {
784 // Dump list of compiled instructions
785 fprintf(stderr, "Compiled the following for iseq=%p:\n", (void *)iseq);
786 for (uint32_t idx = block->blockid.idx; idx < insn_idx; ) {
787 int opcode = yjit_opcode_at_pc(iseq, yjit_iseq_pc_at_idx(iseq, idx));
788 fprintf(stderr, " %04d %s\n", idx, insn_name(opcode));
789 idx += insn_len(opcode);
793 return block;
796 static codegen_status_t gen_opt_send_without_block(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb);
798 static codegen_status_t
799 gen_nop(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
801 // Do nothing
802 return YJIT_KEEP_COMPILING;
805 static codegen_status_t
806 gen_dup(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
808 // Get the top value and its type
809 x86opnd_t dup_val = ctx_stack_pop(ctx, 0);
810 temp_type_mapping_t mapping = ctx_get_opnd_mapping(ctx, OPND_STACK(0));
812 // Push the same value on top
813 x86opnd_t loc0 = ctx_stack_push_mapping(ctx, mapping);
814 mov(cb, REG0, dup_val);
815 mov(cb, loc0, REG0);
817 return YJIT_KEEP_COMPILING;
820 // duplicate stack top n elements
821 static codegen_status_t
822 gen_dupn(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
824 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
826 // In practice, this seems to be used only for n == 2
827 if (n != 2) {
828 return YJIT_CANT_COMPILE;
831 x86opnd_t opnd1 = ctx_stack_opnd(ctx, 1);
832 x86opnd_t opnd0 = ctx_stack_opnd(ctx, 0);
833 temp_type_mapping_t mapping1 = ctx_get_opnd_mapping(ctx, OPND_STACK(1));
834 temp_type_mapping_t mapping0 = ctx_get_opnd_mapping(ctx, OPND_STACK(0));
836 x86opnd_t dst1 = ctx_stack_push_mapping(ctx, mapping1);
837 mov(cb, REG0, opnd1);
838 mov(cb, dst1, REG0);
840 x86opnd_t dst0 = ctx_stack_push_mapping(ctx, mapping0);
841 mov(cb, REG0, opnd0);
842 mov(cb, dst0, REG0);
844 return YJIT_KEEP_COMPILING;
847 static void
848 stack_swap(ctx_t *ctx, codeblock_t *cb, int offset0, int offset1, x86opnd_t reg0, x86opnd_t reg1)
850 x86opnd_t opnd0 = ctx_stack_opnd(ctx, offset0);
851 x86opnd_t opnd1 = ctx_stack_opnd(ctx, offset1);
853 temp_type_mapping_t mapping0 = ctx_get_opnd_mapping(ctx, OPND_STACK(offset0));
854 temp_type_mapping_t mapping1 = ctx_get_opnd_mapping(ctx, OPND_STACK(offset1));
856 mov(cb, reg0, opnd0);
857 mov(cb, reg1, opnd1);
858 mov(cb, opnd0, reg1);
859 mov(cb, opnd1, reg0);
861 ctx_set_opnd_mapping(ctx, OPND_STACK(offset0), mapping1);
862 ctx_set_opnd_mapping(ctx, OPND_STACK(offset1), mapping0);
865 // Swap top 2 stack entries
866 static codegen_status_t
867 gen_swap(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
869 stack_swap(ctx, cb, 0, 1, REG0, REG1);
870 return YJIT_KEEP_COMPILING;
873 // set Nth stack entry to stack top
874 static codegen_status_t
875 gen_setn(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
877 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
879 // Set the destination
880 x86opnd_t top_val = ctx_stack_pop(ctx, 0);
881 x86opnd_t dst_opnd = ctx_stack_opnd(ctx, (int32_t)n);
882 mov(cb, REG0, top_val);
883 mov(cb, dst_opnd, REG0);
885 temp_type_mapping_t mapping = ctx_get_opnd_mapping(ctx, OPND_STACK(0));
886 ctx_set_opnd_mapping(ctx, OPND_STACK(n), mapping);
888 return YJIT_KEEP_COMPILING;
891 // get nth stack value, then push it
892 static codegen_status_t
893 gen_topn(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
895 int32_t n = (int32_t)jit_get_arg(jit, 0);
897 // Get top n type / operand
898 x86opnd_t top_n_val = ctx_stack_opnd(ctx, n);
899 temp_type_mapping_t mapping = ctx_get_opnd_mapping(ctx, OPND_STACK(n));
901 x86opnd_t loc0 = ctx_stack_push_mapping(ctx, mapping);
902 mov(cb, REG0, top_n_val);
903 mov(cb, loc0, REG0);
905 return YJIT_KEEP_COMPILING;
908 static codegen_status_t
909 gen_pop(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
911 // Decrement SP
912 ctx_stack_pop(ctx, 1);
913 return YJIT_KEEP_COMPILING;
916 // Pop n values off the stack
917 static codegen_status_t
918 gen_adjuststack(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
920 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
921 ctx_stack_pop(ctx, n);
922 return YJIT_KEEP_COMPILING;
925 // new array initialized from top N values
926 static codegen_status_t
927 gen_newarray(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
929 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
931 // Save the PC and SP because we are allocating
932 jit_prepare_routine_call(jit, ctx, REG0);
934 x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(int32_t)(sizeof(VALUE) * (uint32_t)n));
936 // call rb_ec_ary_new_from_values(struct rb_execution_context_struct *ec, long n, const VALUE *elts);
937 mov(cb, C_ARG_REGS[0], REG_EC);
938 mov(cb, C_ARG_REGS[1], imm_opnd(n));
939 lea(cb, C_ARG_REGS[2], values_ptr);
940 call_ptr(cb, REG0, (void *)rb_ec_ary_new_from_values);
942 ctx_stack_pop(ctx, n);
943 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_ARRAY);
944 mov(cb, stack_ret, RAX);
946 return YJIT_KEEP_COMPILING;
949 // dup array
950 static codegen_status_t
951 gen_duparray(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
953 VALUE ary = jit_get_arg(jit, 0);
955 // Save the PC and SP because we are allocating
956 jit_prepare_routine_call(jit, ctx, REG0);
958 // call rb_ary_resurrect(VALUE ary);
959 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], ary);
960 call_ptr(cb, REG0, (void *)rb_ary_resurrect);
962 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_ARRAY);
963 mov(cb, stack_ret, RAX);
965 return YJIT_KEEP_COMPILING;
968 // dup hash
969 static codegen_status_t
970 gen_duphash(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
972 VALUE hash = jit_get_arg(jit, 0);
974 // Save the PC and SP because we are allocating
975 jit_prepare_routine_call(jit, ctx, REG0);
977 // call rb_hash_resurrect(VALUE hash);
978 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], hash);
979 call_ptr(cb, REG0, (void *)rb_hash_resurrect);
981 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HASH);
982 mov(cb, stack_ret, RAX);
984 return YJIT_KEEP_COMPILING;
987 VALUE rb_vm_splat_array(VALUE flag, VALUE ary);
989 // call to_a on the array on the stack
990 static codegen_status_t
991 gen_splatarray(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
993 VALUE flag = (VALUE) jit_get_arg(jit, 0);
995 // Save the PC and SP because the callee may allocate
996 // Note that this modifies REG_SP, which is why we do it first
997 jit_prepare_routine_call(jit, ctx, REG0);
999 // Get the operands from the stack
1000 x86opnd_t ary_opnd = ctx_stack_pop(ctx, 1);
1002 // Call rb_vm_splat_array(flag, ary)
1003 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], flag);
1004 mov(cb, C_ARG_REGS[1], ary_opnd);
1005 call_ptr(cb, REG1, (void *) rb_vm_splat_array);
1007 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_ARRAY);
1008 mov(cb, stack_ret, RAX);
1010 return YJIT_KEEP_COMPILING;
1013 // new range initialized from top 2 values
1014 static codegen_status_t
1015 gen_newrange(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1017 rb_num_t flag = (rb_num_t)jit_get_arg(jit, 0);
1019 // rb_range_new() allocates and can raise
1020 jit_prepare_routine_call(jit, ctx, REG0);
1022 // val = rb_range_new(low, high, (int)flag);
1023 mov(cb, C_ARG_REGS[0], ctx_stack_opnd(ctx, 1));
1024 mov(cb, C_ARG_REGS[1], ctx_stack_opnd(ctx, 0));
1025 mov(cb, C_ARG_REGS[2], imm_opnd(flag));
1026 call_ptr(cb, REG0, (void *)rb_range_new);
1028 ctx_stack_pop(ctx, 2);
1029 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HEAP);
1030 mov(cb, stack_ret, RAX);
1032 return YJIT_KEEP_COMPILING;
1035 static void
1036 guard_object_is_heap(codeblock_t *cb, x86opnd_t object_opnd, ctx_t *ctx, uint8_t *side_exit)
1038 ADD_COMMENT(cb, "guard object is heap");
1040 // Test that the object is not an immediate
1041 test(cb, object_opnd, imm_opnd(RUBY_IMMEDIATE_MASK));
1042 jnz_ptr(cb, side_exit);
1044 // Test that the object is not false or nil
1045 cmp(cb, object_opnd, imm_opnd(Qnil));
1046 RUBY_ASSERT(Qfalse < Qnil);
1047 jbe_ptr(cb, side_exit);
1050 static inline void
1051 guard_object_is_array(codeblock_t *cb, x86opnd_t object_opnd, x86opnd_t flags_opnd, ctx_t *ctx, uint8_t *side_exit)
1053 ADD_COMMENT(cb, "guard object is array");
1055 // Pull out the type mask
1056 mov(cb, flags_opnd, member_opnd(object_opnd, struct RBasic, flags));
1057 and(cb, flags_opnd, imm_opnd(RUBY_T_MASK));
1059 // Compare the result with T_ARRAY
1060 cmp(cb, flags_opnd, imm_opnd(T_ARRAY));
1061 jne_ptr(cb, side_exit);
1064 // push enough nils onto the stack to fill out an array
1065 static codegen_status_t
1066 gen_expandarray(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1068 int flag = (int) jit_get_arg(jit, 1);
1070 // If this instruction has the splat flag, then bail out.
1071 if (flag & 0x01) {
1072 GEN_COUNTER_INC(cb, expandarray_splat);
1073 return YJIT_CANT_COMPILE;
1076 // If this instruction has the postarg flag, then bail out.
1077 if (flag & 0x02) {
1078 GEN_COUNTER_INC(cb, expandarray_postarg);
1079 return YJIT_CANT_COMPILE;
1082 uint8_t *side_exit = yjit_side_exit(jit, ctx);
1084 // num is the number of requested values. If there aren't enough in the
1085 // array then we're going to push on nils.
1086 int num = (int)jit_get_arg(jit, 0);
1087 val_type_t array_type = ctx_get_opnd_type(ctx, OPND_STACK(0));
1088 x86opnd_t array_opnd = ctx_stack_pop(ctx, 1);
1090 if (array_type.type == ETYPE_NIL) {
1091 // special case for a, b = nil pattern
1092 // push N nils onto the stack
1093 for (int i = 0; i < num; i++) {
1094 x86opnd_t push = ctx_stack_push(ctx, TYPE_NIL);
1095 mov(cb, push, imm_opnd(Qnil));
1097 return YJIT_KEEP_COMPILING;
1100 // Move the array from the stack into REG0 and check that it's an array.
1101 mov(cb, REG0, array_opnd);
1102 guard_object_is_heap(cb, REG0, ctx, COUNTED_EXIT(jit, side_exit, expandarray_not_array));
1103 guard_object_is_array(cb, REG0, REG1, ctx, COUNTED_EXIT(jit, side_exit, expandarray_not_array));
1105 // If we don't actually want any values, then just return.
1106 if (num == 0) {
1107 return YJIT_KEEP_COMPILING;
1110 // Pull out the embed flag to check if it's an embedded array.
1111 x86opnd_t flags_opnd = member_opnd(REG0, struct RBasic, flags);
1112 mov(cb, REG1, flags_opnd);
1114 // Move the length of the embedded array into REG1.
1115 and(cb, REG1, imm_opnd(RARRAY_EMBED_LEN_MASK));
1116 shr(cb, REG1, imm_opnd(RARRAY_EMBED_LEN_SHIFT));
1118 // Conditionally move the length of the heap array into REG1.
1119 test(cb, flags_opnd, imm_opnd(RARRAY_EMBED_FLAG));
1120 cmovz(cb, REG1, member_opnd(REG0, struct RArray, as.heap.len));
1122 // Only handle the case where the number of values in the array is greater
1123 // than or equal to the number of values requested.
1124 cmp(cb, REG1, imm_opnd(num));
1125 jl_ptr(cb, COUNTED_EXIT(jit, side_exit, expandarray_rhs_too_small));
1127 // Load the address of the embedded array into REG1.
1128 // (struct RArray *)(obj)->as.ary
1129 lea(cb, REG1, member_opnd(REG0, struct RArray, as.ary));
1131 // Conditionally load the address of the heap array into REG1.
1132 // (struct RArray *)(obj)->as.heap.ptr
1133 test(cb, flags_opnd, imm_opnd(RARRAY_EMBED_FLAG));
1134 cmovz(cb, REG1, member_opnd(REG0, struct RArray, as.heap.ptr));
1136 // Loop backward through the array and push each element onto the stack.
1137 for (int32_t i = (int32_t) num - 1; i >= 0; i--) {
1138 x86opnd_t top = ctx_stack_push(ctx, TYPE_UNKNOWN);
1139 mov(cb, REG0, mem_opnd(64, REG1, i * SIZEOF_VALUE));
1140 mov(cb, top, REG0);
1143 return YJIT_KEEP_COMPILING;
1146 // new hash initialized from top N values
1147 static codegen_status_t
1148 gen_newhash(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1150 int32_t num = (int32_t)jit_get_arg(jit, 0);
1152 // Save the PC and SP because we are allocating
1153 jit_prepare_routine_call(jit, ctx, REG0);
1155 if (num) {
1156 // val = rb_hash_new_with_size(num / 2);
1157 mov(cb, C_ARG_REGS[0], imm_opnd(num / 2));
1158 call_ptr(cb, REG0, (void *)rb_hash_new_with_size);
1160 // save the allocated hash as we want to push it after insertion
1161 push(cb, RAX);
1162 push(cb, RAX); // alignment
1164 // rb_hash_bulk_insert(num, STACK_ADDR_FROM_TOP(num), val);
1165 mov(cb, C_ARG_REGS[0], imm_opnd(num));
1166 lea(cb, C_ARG_REGS[1], ctx_stack_opnd(ctx, num - 1));
1167 mov(cb, C_ARG_REGS[2], RAX);
1168 call_ptr(cb, REG0, (void *)rb_hash_bulk_insert);
1170 pop(cb, RAX); // alignment
1171 pop(cb, RAX);
1173 ctx_stack_pop(ctx, num);
1174 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HASH);
1175 mov(cb, stack_ret, RAX);
1177 else {
1178 // val = rb_hash_new();
1179 call_ptr(cb, REG0, (void *)rb_hash_new);
1181 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HASH);
1182 mov(cb, stack_ret, RAX);
1185 return YJIT_KEEP_COMPILING;
1188 // Push a constant value to the stack, including type information.
1189 // The constant may be a heap object or a special constant.
1190 static void
1191 jit_putobject(jitstate_t *jit, ctx_t *ctx, VALUE arg)
1193 val_type_t val_type = yjit_type_of_value(arg);
1194 x86opnd_t stack_top = ctx_stack_push(ctx, val_type);
1196 if (SPECIAL_CONST_P(arg)) {
1197 // Immediates will not move and do not need to be tracked for GC
1198 // Thanks to this we can mov directly to memory when possible.
1200 // NOTE: VALUE -> int64_t cast below is implementation defined.
1201 // Hopefully it preserves the bit pattern or raises a signal.
1202 // See N1256 section 6.3.1.3.
1203 x86opnd_t imm = imm_opnd((int64_t)arg);
1205 // 64-bit immediates can't be directly written to memory
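// For example, INT2FIX(1) == 3 fits in 32 bits and can be stored straight to
// the stack slot, while a flonum or a large Fixnum such as INT2FIX(1LL << 40)
// does not, and has to go through REG0 below.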
1206 if (imm.num_bits <= 32) {
1207 mov(cb, stack_top, imm);
1209 else {
1210 mov(cb, REG0, imm);
1211 mov(cb, stack_top, REG0);
1214 else {
1215 // Load the value to push into REG0
1216 // Note that this value may get moved by the GC
1217 jit_mov_gc_ptr(jit, cb, REG0, arg);
1219 // Write argument at SP
1220 mov(cb, stack_top, REG0);
1224 static codegen_status_t
1225 gen_putnil(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1227 jit_putobject(jit, ctx, Qnil);
1228 return YJIT_KEEP_COMPILING;
1231 static codegen_status_t
1232 gen_putobject(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1234 VALUE arg = jit_get_arg(jit, 0);
1236 jit_putobject(jit, ctx, arg);
1237 return YJIT_KEEP_COMPILING;
1240 static codegen_status_t
1241 gen_putstring(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1243 VALUE put_val = jit_get_arg(jit, 0);
1245 // Save the PC and SP because the callee will allocate
1246 jit_prepare_routine_call(jit, ctx, REG0);
1248 mov(cb, C_ARG_REGS[0], REG_EC);
1249 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[1], put_val);
1250 call_ptr(cb, REG0, (void *)rb_ec_str_resurrect);
1252 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_STRING);
1253 mov(cb, stack_top, RAX);
1255 return YJIT_KEEP_COMPILING;
1258 static codegen_status_t
1259 gen_putobject_int2fix(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1261 int opcode = jit_get_opcode(jit);
1262 int cst_val = (opcode == BIN(putobject_INT2FIX_0_))? 0:1;
1264 jit_putobject(jit, ctx, INT2FIX(cst_val));
1265 return YJIT_KEEP_COMPILING;
1268 static codegen_status_t
1269 gen_putself(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1271 // Load self from CFP
1272 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, self));
1274 // Write it on the stack
1275 x86opnd_t stack_top = ctx_stack_push_self(ctx);
1276 mov(cb, stack_top, REG0);
1278 return YJIT_KEEP_COMPILING;
1281 static codegen_status_t
1282 gen_putspecialobject(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1284 enum vm_special_object_type type = (enum vm_special_object_type)jit_get_arg(jit, 0);
1286 if (type == VM_SPECIAL_OBJECT_VMCORE) {
1287 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_HEAP);
1288 jit_mov_gc_ptr(jit, cb, REG0, rb_mRubyVMFrozenCore);
1289 mov(cb, stack_top, REG0);
1290 return YJIT_KEEP_COMPILING;
1292 else {
1293 // TODO: implement for VM_SPECIAL_OBJECT_CBASE and
1294 // VM_SPECIAL_OBJECT_CONST_BASE
1295 return YJIT_CANT_COMPILE;
1299 // Get EP at level from CFP
1300 static void
1301 gen_get_ep(codeblock_t *cb, x86opnd_t reg, uint32_t level)
1303 // Load environment pointer EP from CFP
1304 mov(cb, reg, member_opnd(REG_CFP, rb_control_frame_t, ep));
1306 while (level--) {
1307 // Get the previous EP from the current EP
1308 // See GET_PREV_EP(ep) macro
1309 // VALUE *prev_ep = ((VALUE *)((ep)[VM_ENV_DATA_INDEX_SPECVAL] & ~0x03))
1310 mov(cb, reg, mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_SPECVAL));
1311 and(cb, reg, imm_opnd(~0x03));
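// Note: the load above indexes off REG0 rather than `reg`; all callers in
// this file currently pass REG0, so the two coincide. The & ~0x03 mirrors
// GET_PREV_EP(), stripping the low tag bits from the specval slot to recover
// the previous EP pointer.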
1315 // Compute the local table index of a variable from its index relative to the
1316 // environment object.
1317 static uint32_t
1318 slot_to_local_idx(const rb_iseq_t *iseq, int32_t slot_idx)
1320 // Layout illustration
1321 // This is an array of VALUE
1322 // | VM_ENV_DATA_SIZE |
1323 // v v
1324 // low addr <+-------+-------+-------+-------+------------------+
1325 // |local 0|local 1| ... |local n| .... |
1326 // +-------+-------+-------+-------+------------------+
1327 // ^ ^ ^ ^
1328 // +-------+---local_table_size----+ cfp->ep--+
1329 // | |
1330 // +------------------slot_idx----------------+
1332 // See usages of local_var_name() from iseq.c for similar calculation.
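// Worked example, assuming VM_ENV_DATA_SIZE == 3: with local_table_size == 2,
// slot_idx == 3 gives op == 0 and local_idx == 1, while slot_idx == 4 gives
// local_idx == 0.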
1334 // FIXME: unsigned to signed cast below can truncate
1335 int32_t local_table_size = iseq->body->local_table_size;
1336 int32_t op = slot_idx - VM_ENV_DATA_SIZE;
1337 int32_t local_idx = local_table_size - op - 1;
1338 RUBY_ASSERT(local_idx >= 0 && local_idx < local_table_size);
1339 return (uint32_t)local_idx;
1342 static codegen_status_t
1343 gen_getlocal_wc0(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1345 // Compute the offset from BP to the local
1346 // TODO: Type is lindex_t in the interpreter. The following cast can truncate.
1347 // Not in the mood to dance around signed multiplication UB at the moment...
1348 int32_t slot_idx = (int32_t)jit_get_arg(jit, 0);
1349 const int32_t offs = -(SIZEOF_VALUE * slot_idx);
1350 uint32_t local_idx = slot_to_local_idx(jit->iseq, slot_idx);
1352 // Load environment pointer EP (level 0) from CFP
1353 gen_get_ep(cb, REG0, 0);
1355 // Load the local from the EP
1356 mov(cb, REG0, mem_opnd(64, REG0, offs));
1358 // Write the local at SP
1359 x86opnd_t stack_top = ctx_stack_push_local(ctx, local_idx);
1360 mov(cb, stack_top, REG0);
1362 return YJIT_KEEP_COMPILING;
1365 static codegen_status_t
1366 gen_getlocal_generic(ctx_t *ctx, uint32_t local_idx, uint32_t level)
1368 gen_get_ep(cb, REG0, level);
1370 // Load the local from the block
1371 // val = *(vm_get_ep(GET_EP(), level) - idx);
1372 const int32_t offs = -(int32_t)(SIZEOF_VALUE * local_idx);
1373 mov(cb, REG0, mem_opnd(64, REG0, offs));
1375 // Write the local at SP
1376 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_UNKNOWN);
1377 mov(cb, stack_top, REG0);
1379 return YJIT_KEEP_COMPILING;
1382 static codegen_status_t
1383 gen_getlocal(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1385 int32_t idx = (int32_t)jit_get_arg(jit, 0);
1386 int32_t level = (int32_t)jit_get_arg(jit, 1);
1387 return gen_getlocal_generic(ctx, idx, level);
1390 static codegen_status_t
1391 gen_getlocal_wc1(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1393 int32_t idx = (int32_t)jit_get_arg(jit, 0);
1394 return gen_getlocal_generic(ctx, idx, 1);
1397 static codegen_status_t
1398 gen_setlocal_wc0(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1401 vm_env_write(const VALUE *ep, int index, VALUE v)
1403 VALUE flags = ep[VM_ENV_DATA_INDEX_FLAGS];
1404 if (LIKELY((flags & VM_ENV_FLAG_WB_REQUIRED) == 0)) {
1405 VM_STACK_ENV_WRITE(ep, index, v);
1407 else {
1408 vm_env_write_slowpath(ep, index, v);
1413 int32_t slot_idx = (int32_t)jit_get_arg(jit, 0);
1414 uint32_t local_idx = slot_to_local_idx(jit->iseq, slot_idx);
1416 // Load environment pointer EP (level 0) from CFP
1417 gen_get_ep(cb, REG0, 0);
1419 // flags & VM_ENV_FLAG_WB_REQUIRED
1420 x86opnd_t flags_opnd = mem_opnd(64, REG0, sizeof(VALUE) * VM_ENV_DATA_INDEX_FLAGS);
1421 test(cb, flags_opnd, imm_opnd(VM_ENV_FLAG_WB_REQUIRED));
1423 // Create a side-exit to fall back to the interpreter
1424 uint8_t *side_exit = yjit_side_exit(jit, ctx);
1426 // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0
1427 jnz_ptr(cb, side_exit);
1429 // Set the type of the local variable in the context
1430 val_type_t temp_type = ctx_get_opnd_type(ctx, OPND_STACK(0));
1431 ctx_set_local_type(ctx, local_idx, temp_type);
1433 // Pop the value to write from the stack
1434 x86opnd_t stack_top = ctx_stack_pop(ctx, 1);
1435 mov(cb, REG1, stack_top);
1437 // Write the value at the environment pointer
1438 const int32_t offs = -8 * slot_idx;
1439 mov(cb, mem_opnd(64, REG0, offs), REG1);
1441 return YJIT_KEEP_COMPILING;
1444 // Push Qtrue or Qfalse depending on whether the given keyword was supplied by
1445 // the caller.
1446 static codegen_status_t
1447 gen_checkkeyword(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1449 // When a keyword is unspecified past index 32, a hash will be used
1450 // instead. This can only happen in iseqs taking more than 32 keywords.
1451 if (jit->iseq->body->param.keyword->num >= 32) {
1452 return YJIT_CANT_COMPILE;
1455 // The EP offset to the undefined bits local
1456 int32_t bits_offset = (int32_t)jit_get_arg(jit, 0);
1458 // The index of the keyword we want to check
1459 int32_t index = (int32_t)jit_get_arg(jit, 1);
1461 // Load environment pointer EP
1462 gen_get_ep(cb, REG0, 0);
1464 // VALUE kw_bits = *(ep - bits);
1465 x86opnd_t bits_opnd = mem_opnd(64, REG0, sizeof(VALUE) * -bits_offset);
1467 // unsigned int b = (unsigned int)FIX2ULONG(kw_bits);
1468 // if ((b & (0x01 << idx))) {
1470 // We can skip the FIX2ULONG conversion by shifting the bit we test
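// (kw_bits is a Fixnum, i.e. (value << 1) | 1, so bit `idx` of the unboxed
// value lives at bit `idx + 1` of the VALUE; e.g. index == 3 tests 0x10)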
1471 int64_t bit_test = 0x01 << (index + 1);
1472 test(cb, bits_opnd, imm_opnd(bit_test));
1473 mov(cb, REG0, imm_opnd(Qfalse));
1474 mov(cb, REG1, imm_opnd(Qtrue));
1475 cmovz(cb, REG0, REG1);
1477 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_IMM);
1478 mov(cb, stack_ret, REG0);
1480 return YJIT_KEEP_COMPILING;
1483 static codegen_status_t
1484 gen_setlocal_generic(jitstate_t *jit, ctx_t *ctx, uint32_t local_idx, uint32_t level)
1486 // Load environment pointer EP at level
1487 gen_get_ep(cb, REG0, level);
1489 // flags & VM_ENV_FLAG_WB_REQUIRED
1490 x86opnd_t flags_opnd = mem_opnd(64, REG0, sizeof(VALUE) * VM_ENV_DATA_INDEX_FLAGS);
1491 test(cb, flags_opnd, imm_opnd(VM_ENV_FLAG_WB_REQUIRED));
1493 // Create a side-exit to fall back to the interpreter
1494 uint8_t *side_exit = yjit_side_exit(jit, ctx);
1496 // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0
1497 jnz_ptr(cb, side_exit);
1499 // Pop the value to write from the stack
1500 x86opnd_t stack_top = ctx_stack_pop(ctx, 1);
1501 mov(cb, REG1, stack_top);
1503 // Write the value at the environment pointer
1504 const int32_t offs = -(int32_t)(SIZEOF_VALUE * local_idx);
1505 mov(cb, mem_opnd(64, REG0, offs), REG1);
1507 return YJIT_KEEP_COMPILING;
1510 static codegen_status_t
1511 gen_setlocal(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1513 int32_t idx = (int32_t)jit_get_arg(jit, 0);
1514 int32_t level = (int32_t)jit_get_arg(jit, 1);
1515 return gen_setlocal_generic(jit, ctx, idx, level);
1518 static codegen_status_t
1519 gen_setlocal_wc1(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1521 int32_t idx = (int32_t)jit_get_arg(jit, 0);
1522 return gen_setlocal_generic(jit, ctx, idx, 1);
1525 static void
1526 gen_jnz_to_target0(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
1528 switch (shape) {
1529 case SHAPE_NEXT0:
1530 case SHAPE_NEXT1:
1531 RUBY_ASSERT(false);
1532 break;
1534 case SHAPE_DEFAULT:
1535 jnz_ptr(cb, target0);
1536 break;
1540 static void
1541 gen_jz_to_target0(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
1543 switch (shape) {
1544 case SHAPE_NEXT0:
1545 case SHAPE_NEXT1:
1546 RUBY_ASSERT(false);
1547 break;
1549 case SHAPE_DEFAULT:
1550 jz_ptr(cb, target0);
1551 break;
1555 static void
1556 gen_jbe_to_target0(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
1558 switch (shape) {
1559 case SHAPE_NEXT0:
1560 case SHAPE_NEXT1:
1561 RUBY_ASSERT(false);
1562 break;
1564 case SHAPE_DEFAULT:
1565 jbe_ptr(cb, target0);
1566 break;
1570 enum jcc_kinds {
1571 JCC_JNE,
1572 JCC_JNZ,
1573 JCC_JZ,
1574 JCC_JE,
1575 JCC_JBE,
1576 JCC_JNA,
1579 // Generate a jump to a stub that recompiles the current YARV instruction on failure.
1580 // When depth_limit is exceeded, generate a jump to a side exit.
1581 static void
1582 jit_chain_guard(enum jcc_kinds jcc, jitstate_t *jit, const ctx_t *ctx, uint8_t depth_limit, uint8_t *side_exit)
1584 branchgen_fn target0_gen_fn;
1586 switch (jcc) {
1587 case JCC_JNE:
1588 case JCC_JNZ:
1589 target0_gen_fn = gen_jnz_to_target0;
1590 break;
1591 case JCC_JZ:
1592 case JCC_JE:
1593 target0_gen_fn = gen_jz_to_target0;
1594 break;
1595 case JCC_JBE:
1596 case JCC_JNA:
1597 target0_gen_fn = gen_jbe_to_target0;
1598 break;
1599 default:
1600 rb_bug("yjit: unimplemented jump kind");
1601 break;
1604 if (ctx->chain_depth < depth_limit) {
1605 ctx_t deeper = *ctx;
1606 deeper.chain_depth++;
1608 gen_branch(
1609 jit,
1610 ctx,
1611 (blockid_t) { jit->iseq, jit->insn_idx },
1612 &deeper,
1613 BLOCKID_NULL,
1614 NULL,
1615 target0_gen_fn
1618 else {
1619 target0_gen_fn(cb, side_exit, NULL, SHAPE_DEFAULT);
1623 enum {
1624 GETIVAR_MAX_DEPTH = 10, // up to 5 different classes, and embedded or not for each
1625 OPT_AREF_MAX_CHAIN_DEPTH = 2, // hashes and arrays
1626 SEND_MAX_DEPTH = 5, // up to 5 different classes
1629 VALUE rb_vm_set_ivar_idx(VALUE obj, uint32_t idx, VALUE val);
1631 // Codegen for setting an instance variable.
1632 // Preconditions:
1633 // - receiver is in REG0
1634 // - receiver has the same class as CLASS_OF(comptime_receiver)
1635 // - no stack push or pops to ctx since the entry to the codegen of the instruction being compiled
1636 static codegen_status_t
1637 gen_set_ivar(jitstate_t *jit, ctx_t *ctx, VALUE recv, VALUE klass, ID ivar_name)
1639 // Save the PC and SP because the callee may allocate
1640 // Note that this modifies REG_SP, which is why we do it first
1641 jit_prepare_routine_call(jit, ctx, REG0);
1643 // Get the operands from the stack
1644 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
1645 x86opnd_t recv_opnd = ctx_stack_pop(ctx, 1);
1647 uint32_t ivar_index = rb_obj_ensure_iv_index_mapping(recv, ivar_name);
1649 // Call rb_vm_set_ivar_idx with the receiver, the index of the ivar, and the value
1650 mov(cb, C_ARG_REGS[0], recv_opnd);
1651 mov(cb, C_ARG_REGS[1], imm_opnd(ivar_index));
1652 mov(cb, C_ARG_REGS[2], val_opnd);
1653 call_ptr(cb, REG0, (void *)rb_vm_set_ivar_idx);
1655 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_UNKNOWN);
1656 mov(cb, out_opnd, RAX);
1658 return YJIT_KEEP_COMPILING;
1661 // Codegen for getting an instance variable.
1662 // Preconditions:
1663 // - receiver is in REG0
1664 // - receiver has the same class as CLASS_OF(comptime_receiver)
1665 // - no stack push or pops to ctx since the entry to the codegen of the instruction being compiled
1666 static codegen_status_t
1667 gen_get_ivar(jitstate_t *jit, ctx_t *ctx, const int max_chain_depth, VALUE comptime_receiver, ID ivar_name, insn_opnd_t reg0_opnd, uint8_t *side_exit)
1669 VALUE comptime_val_klass = CLASS_OF(comptime_receiver);
1670 const ctx_t starting_context = *ctx; // make a copy for use with jit_chain_guard
1672 // If the class uses the default allocator, instances should all be T_OBJECT
1673 // NOTE: This assumes nobody changes the allocator of the class after allocation.
1674 // Eventually, we can encode whether an object is T_OBJECT or not
1675 // inside object shapes.
1676 if (!RB_TYPE_P(comptime_receiver, T_OBJECT) ||
1677 rb_get_alloc_func(comptime_val_klass) != rb_class_allocate_instance) {
1678 // General case. Call rb_ivar_get().
1679 // VALUE rb_ivar_get(VALUE obj, ID id)
1680 ADD_COMMENT(cb, "call rb_ivar_get()");
1682 // The function could raise exceptions.
1683 jit_prepare_routine_call(jit, ctx, REG1);
1685 mov(cb, C_ARG_REGS[0], REG0);
1686 mov(cb, C_ARG_REGS[1], imm_opnd((int64_t)ivar_name));
1687 call_ptr(cb, REG1, (void *)rb_ivar_get);
1689 if (!reg0_opnd.is_self) {
1690 (void)ctx_stack_pop(ctx, 1);
1692 // Push the ivar on the stack
1693 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_UNKNOWN);
1694 mov(cb, out_opnd, RAX);
1696 // Jump to next instruction. This allows guard chains to share the same successor.
1697 jit_jump_to_next_insn(jit, ctx);
1698 return YJIT_END_BLOCK;
1702 // FIXME:
1703 // This check was added because of a failure in a test involving the
1704 // Nokogiri Document class where we see a T_DATA that still has the default
1705 // allocator.
1706 // Aaron Patterson argues that this is a bug in the C extension, because
1707 // people could call .allocate() on the class and still get a T_OBJECT
1708 // For now I added an extra dynamic check that the receiver is T_OBJECT
1709 // so we can safely pass all the tests in Shopify Core.
1711 // Guard that the receiver is T_OBJECT
1712 // #define RB_BUILTIN_TYPE(x) (int)(((struct RBasic*)(x))->flags & RUBY_T_MASK)
1713 ADD_COMMENT(cb, "guard receiver is T_OBJECT");
1714 mov(cb, REG1, member_opnd(REG0, struct RBasic, flags));
1715 and(cb, REG1, imm_opnd(RUBY_T_MASK));
1716 cmp(cb, REG1, imm_opnd(T_OBJECT));
1717 jit_chain_guard(JCC_JNE, jit, &starting_context, max_chain_depth, side_exit);
1720 // FIXME: Mapping the index could fail when there are too many ivar names. If we're
1721 // compiling for a branch stub, that can cause the exception to be thrown from the
1722 // wrong PC.
1723 uint32_t ivar_index = rb_obj_ensure_iv_index_mapping(comptime_receiver, ivar_name);
1725 // Pop receiver if it's on the temp stack
1726 if (!reg0_opnd.is_self) {
1727 (void)ctx_stack_pop(ctx, 1);
1730 // Compile time self is embedded and the ivar index lands within the object
1731 if (RB_FL_TEST_RAW(comptime_receiver, ROBJECT_EMBED) && ivar_index < ROBJECT_EMBED_LEN_MAX) {
1732 // See ROBJECT_IVPTR() from include/ruby/internal/core/robject.h
1734 // Guard that self is embedded
1735 // TODO: BT and JC is shorter
1736 ADD_COMMENT(cb, "guard embedded getivar");
1737 x86opnd_t flags_opnd = member_opnd(REG0, struct RBasic, flags);
1738 test(cb, flags_opnd, imm_opnd(ROBJECT_EMBED));
1739 jit_chain_guard(JCC_JZ, jit, &starting_context, max_chain_depth, COUNTED_EXIT(jit, side_exit, getivar_megamorphic));
1741 // Load the variable
1742 x86opnd_t ivar_opnd = mem_opnd(64, REG0, offsetof(struct RObject, as.ary) + ivar_index * SIZEOF_VALUE);
1743 mov(cb, REG1, ivar_opnd);
1745 // Guard that the variable is not Qundef
1746 cmp(cb, REG1, imm_opnd(Qundef));
1747 mov(cb, REG0, imm_opnd(Qnil));
1748 cmove(cb, REG1, REG0);
1750 // Push the ivar on the stack
1751 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_UNKNOWN);
1752 mov(cb, out_opnd, REG1);
1754 else {
1755 // Compile time value is *not* embedded.
1757 // Guard that value is *not* embedded
1758 // See ROBJECT_IVPTR() from include/ruby/internal/core/robject.h
1759 ADD_COMMENT(cb, "guard extended getivar");
1760 x86opnd_t flags_opnd = member_opnd(REG0, struct RBasic, flags);
1761 test(cb, flags_opnd, imm_opnd(ROBJECT_EMBED));
1762 jit_chain_guard(JCC_JNZ, jit, &starting_context, max_chain_depth, COUNTED_EXIT(jit, side_exit, getivar_megamorphic));
1764 // check that the extended table is big enough
1765 if (ivar_index >= ROBJECT_EMBED_LEN_MAX + 1) {
1766 // Check that the slot is inside the extended table (num_slots > index)
1767 x86opnd_t num_slots = mem_opnd(32, REG0, offsetof(struct RObject, as.heap.numiv));
1768 cmp(cb, num_slots, imm_opnd(ivar_index));
1769 jle_ptr(cb, COUNTED_EXIT(jit, side_exit, getivar_idx_out_of_range));
1772 // Get a pointer to the extended table
1773 x86opnd_t tbl_opnd = mem_opnd(64, REG0, offsetof(struct RObject, as.heap.ivptr));
1774 mov(cb, REG0, tbl_opnd);
1776 // Read the ivar from the extended table
1777 x86opnd_t ivar_opnd = mem_opnd(64, REG0, sizeof(VALUE) * ivar_index);
1778 mov(cb, REG0, ivar_opnd);
1780 // Check that the ivar is not Qundef
1781 cmp(cb, REG0, imm_opnd(Qundef));
1782 mov(cb, REG1, imm_opnd(Qnil));
1783 cmove(cb, REG0, REG1);
1785 // Push the ivar on the stack
1786 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_UNKNOWN);
1787 mov(cb, out_opnd, REG0);
1790 // Jump to next instruction. This allows guard chains to share the same successor.
1791 jit_jump_to_next_insn(jit, ctx);
1792 return YJIT_END_BLOCK;
1795 static codegen_status_t
1796 gen_getinstancevariable(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1798 // Defer compilation so we can specialize on a runtime `self`
1799 if (!jit_at_current_insn(jit)) {
1800 defer_compilation(jit, ctx);
1801 return YJIT_END_BLOCK;
1804 ID ivar_name = (ID)jit_get_arg(jit, 0);
1806 VALUE comptime_val = jit_peek_at_self(jit, ctx);
1807 VALUE comptime_val_klass = CLASS_OF(comptime_val);
1809 // Generate a side exit
1810 uint8_t *side_exit = yjit_side_exit(jit, ctx);
1812 // Guard that the receiver has the same class as the one from compile time.
1813 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, self));
1815 jit_guard_known_klass(jit, ctx, comptime_val_klass, OPND_SELF, comptime_val, GETIVAR_MAX_DEPTH, side_exit);
1817 return gen_get_ivar(jit, ctx, GETIVAR_MAX_DEPTH, comptime_val, ivar_name, OPND_SELF, side_exit);
1820 void rb_vm_setinstancevariable(const rb_iseq_t *iseq, VALUE obj, ID id, VALUE val, IVC ic);
1822 static codegen_status_t
1823 gen_setinstancevariable(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1825 ID id = (ID)jit_get_arg(jit, 0);
1826 IVC ic = (IVC)jit_get_arg(jit, 1);
1828 // Save the PC and SP because the callee may allocate
1829 // Note that this modifies REG_SP, which is why we do it first
1830 jit_prepare_routine_call(jit, ctx, REG0);
1832 // Get the operands from the stack
1833 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
1835 // Call rb_vm_setinstancevariable(iseq, obj, id, val, ic);
1836 mov(cb, C_ARG_REGS[1], member_opnd(REG_CFP, rb_control_frame_t, self));
1837 mov(cb, C_ARG_REGS[3], val_opnd);
1838 mov(cb, C_ARG_REGS[2], imm_opnd(id));
1839 mov(cb, C_ARG_REGS[4], const_ptr_opnd(ic));
1840 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], (VALUE)jit->iseq);
1841 call_ptr(cb, REG0, (void *)rb_vm_setinstancevariable);
1843 return YJIT_KEEP_COMPILING;
1846 bool rb_vm_defined(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, rb_num_t op_type, VALUE obj, VALUE v);
1848 static codegen_status_t
1849 gen_defined(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1851 rb_num_t op_type = (rb_num_t)jit_get_arg(jit, 0);
1852 VALUE obj = (VALUE)jit_get_arg(jit, 1);
1853 VALUE pushval = (VALUE)jit_get_arg(jit, 2);
1855 // Save the PC and SP because the callee may allocate
1856 // Note that this modifies REG_SP, which is why we do it first
1857 jit_prepare_routine_call(jit, ctx, REG0);
1859 // Get the operands from the stack
1860 x86opnd_t v_opnd = ctx_stack_pop(ctx, 1);
1862 // Call vm_defined(ec, reg_cfp, op_type, obj, v)
1863 mov(cb, C_ARG_REGS[0], REG_EC);
1864 mov(cb, C_ARG_REGS[1], REG_CFP);
1865 mov(cb, C_ARG_REGS[2], imm_opnd(op_type));
1866 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[3], (VALUE)obj);
1867 mov(cb, C_ARG_REGS[4], v_opnd);
1868 call_ptr(cb, REG0, (void *)rb_vm_defined);
1870 // if (vm_defined(ec, GET_CFP(), op_type, obj, v)) {
1871 // val = pushval;
1872 // }
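// rb_vm_defined() returns a C bool, so only the low byte of RAX (AL) matters:
// if AL is non-zero the expression is defined and we select pushval, otherwise
// we keep Qnil.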
1873 jit_mov_gc_ptr(jit, cb, REG1, (VALUE)pushval);
1874 cmp(cb, AL, imm_opnd(0));
1875 mov(cb, RAX, imm_opnd(Qnil));
1876 cmovnz(cb, RAX, REG1);
1878 // Push the return value onto the stack
1879 val_type_t out_type = SPECIAL_CONST_P(pushval) ? TYPE_IMM : TYPE_UNKNOWN;
1880 x86opnd_t stack_ret = ctx_stack_push(ctx, out_type);
1881 mov(cb, stack_ret, RAX);
1883 return YJIT_KEEP_COMPILING;
1886 static codegen_status_t
1887 gen_checktype(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1889 enum ruby_value_type type_val = (enum ruby_value_type)jit_get_arg(jit, 0);
1890 // Only three types are emitted by compile.c
1891 if (type_val == T_STRING || type_val == T_ARRAY || type_val == T_HASH) {
1892 val_type_t val_type = ctx_get_opnd_type(ctx, OPND_STACK(0));
1893 x86opnd_t val = ctx_stack_pop(ctx, 1);
1895 x86opnd_t stack_ret;
1897 // Check if we know from type information
1898 if ((type_val == T_STRING && val_type.type == ETYPE_STRING) ||
1899 (type_val == T_ARRAY && val_type.type == ETYPE_ARRAY) ||
1900 (type_val == T_HASH && val_type.type == ETYPE_HASH)) {
1901 // guaranteed type match
1902 stack_ret = ctx_stack_push(ctx, TYPE_TRUE);
1903 mov(cb, stack_ret, imm_opnd(Qtrue));
1904 return YJIT_KEEP_COMPILING;
1906 else if (val_type.is_imm || val_type.type != ETYPE_UNKNOWN) {
1907 // guaranteed not to match T_STRING/T_ARRAY/T_HASH
1908 stack_ret = ctx_stack_push(ctx, TYPE_FALSE);
1909 mov(cb, stack_ret, imm_opnd(Qfalse));
1910 return YJIT_KEEP_COMPILING;
1913 mov(cb, REG0, val);
1914 mov(cb, REG1, imm_opnd(Qfalse));
1916 uint32_t ret = cb_new_label(cb, "ret");
1918 if (!val_type.is_heap) {
1919 // if (SPECIAL_CONST_P(val)) {
1920 // Return Qfalse via REG1 if not on heap
1921 test(cb, REG0, imm_opnd(RUBY_IMMEDIATE_MASK));
1922 jnz_label(cb, ret);
1923 cmp(cb, REG0, imm_opnd(Qnil));
1924 jbe_label(cb, ret);
1927 // Check type on object
1928 mov(cb, REG0, mem_opnd(64, REG0, offsetof(struct RBasic, flags)));
1929 and(cb, REG0, imm_opnd(RUBY_T_MASK));
1930 cmp(cb, REG0, imm_opnd(type_val));
1931 mov(cb, REG0, imm_opnd(Qtrue));
1932 // REG1 contains Qfalse from above
1933 cmove(cb, REG1, REG0);
1935 cb_write_label(cb, ret);
1936 stack_ret = ctx_stack_push(ctx, TYPE_IMM);
1937 mov(cb, stack_ret, REG1);
1938 cb_link_labels(cb);
1940 return YJIT_KEEP_COMPILING;
1942 else {
1943 return YJIT_CANT_COMPILE;
1947 static codegen_status_t
1948 gen_concatstrings(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1950 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
1952 // Save the PC and SP because we are allocating
1953 jit_prepare_routine_call(jit, ctx, REG0);
1955 x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(int32_t)(sizeof(VALUE) * (uint32_t)n));
1957 // call rb_str_concat_literals(long n, const VALUE *strings);
1958 mov(cb, C_ARG_REGS[0], imm_opnd(n));
1959 lea(cb, C_ARG_REGS[1], values_ptr);
1960 call_ptr(cb, REG0, (void *)rb_str_concat_literals);
1962 ctx_stack_pop(ctx, n);
1963 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_STRING);
1964 mov(cb, stack_ret, RAX);
1966 return YJIT_KEEP_COMPILING;
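// Guard that the two topmost stack operands are fixnums. If the recorded types
// rule that out, jump straight to the side exit; otherwise emit runtime checks
// as needed and upgrade the recorded operand types to fixnum.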
1969 static void
1970 guard_two_fixnums(ctx_t *ctx, uint8_t *side_exit)
1972 // Get the stack operand types
1973 val_type_t arg1_type = ctx_get_opnd_type(ctx, OPND_STACK(0));
1974 val_type_t arg0_type = ctx_get_opnd_type(ctx, OPND_STACK(1));
1976 if (arg0_type.is_heap || arg1_type.is_heap) {
1977 jmp_ptr(cb, side_exit);
1978 return;
1981 if (arg0_type.type != ETYPE_FIXNUM && arg0_type.type != ETYPE_UNKNOWN) {
1982 jmp_ptr(cb, side_exit);
1983 return;
1986 if (arg1_type.type != ETYPE_FIXNUM && arg1_type.type != ETYPE_UNKNOWN) {
1987 jmp_ptr(cb, side_exit);
1988 return;
1991 RUBY_ASSERT(!arg0_type.is_heap);
1992 RUBY_ASSERT(!arg1_type.is_heap);
1993 RUBY_ASSERT(arg0_type.type == ETYPE_FIXNUM || arg0_type.type == ETYPE_UNKNOWN);
1994 RUBY_ASSERT(arg1_type.type == ETYPE_FIXNUM || arg1_type.type == ETYPE_UNKNOWN);
1996 // Get stack operands without popping them
1997 x86opnd_t arg1 = ctx_stack_opnd(ctx, 0);
1998 x86opnd_t arg0 = ctx_stack_opnd(ctx, 1);
2000 // If not fixnums, fall back
2001 if (arg0_type.type != ETYPE_FIXNUM) {
2002 ADD_COMMENT(cb, "guard arg0 fixnum");
2003 test(cb, arg0, imm_opnd(RUBY_FIXNUM_FLAG));
2004 jz_ptr(cb, side_exit);
2006 if (arg1_type.type != ETYPE_FIXNUM) {
2007 ADD_COMMENT(cb, "guard arg1 fixnum");
2008 test(cb, arg1, imm_opnd(RUBY_FIXNUM_FLAG));
2009 jz_ptr(cb, side_exit);
2012 // Set stack types in context
2013 ctx_upgrade_opnd_type(ctx, OPND_STACK(0), TYPE_FIXNUM);
2014 ctx_upgrade_opnd_type(ctx, OPND_STACK(1), TYPE_FIXNUM);
2017 // Conditional move operation used by comparison operators
2018 typedef void (*cmov_fn)(codeblock_t *cb, x86opnd_t opnd0, x86opnd_t opnd1);
2020 static codegen_status_t
2021 gen_fixnum_cmp(jitstate_t *jit, ctx_t *ctx, cmov_fn cmov_op)
2023 // Defer compilation so we can specialize based on a runtime receiver
2024 if (!jit_at_current_insn(jit)) {
2025 defer_compilation(jit, ctx);
2026 return YJIT_END_BLOCK;
2029 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2030 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2032 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2033 // Create a side-exit to fall back to the interpreter
2034 // Note: we generate the side-exit before popping operands from the stack
2035 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2037 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_LT)) {
2038 return YJIT_CANT_COMPILE;
2041 // Check that both operands are fixnums
2042 guard_two_fixnums(ctx, side_exit);
2044 // Get the operands from the stack
2045 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2046 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2048 // Compare the arguments
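// Fixnum tagging ((n << 1) | 1) preserves signed ordering, so the tagged
// values can be compared directly without untagging.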
2049 xor(cb, REG0_32, REG0_32); // REG0 = Qfalse
2050 mov(cb, REG1, arg0);
2051 cmp(cb, REG1, arg1);
2052 mov(cb, REG1, imm_opnd(Qtrue));
2053 cmov_op(cb, REG0, REG1);
2055 // Push the output on the stack
2056 x86opnd_t dst = ctx_stack_push(ctx, TYPE_UNKNOWN);
2057 mov(cb, dst, REG0);
2059 return YJIT_KEEP_COMPILING;
2061 else {
2062 return gen_opt_send_without_block(jit, ctx, cb);
2066 static codegen_status_t
2067 gen_opt_lt(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2069 return gen_fixnum_cmp(jit, ctx, cmovl);
2072 static codegen_status_t
2073 gen_opt_le(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2075 return gen_fixnum_cmp(jit, ctx, cmovle);
2078 static codegen_status_t
2079 gen_opt_ge(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2081 return gen_fixnum_cmp(jit, ctx, cmovge);
2084 static codegen_status_t
2085 gen_opt_gt(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2087 return gen_fixnum_cmp(jit, ctx, cmovg);
2090 // Implements specialized equality for either two fixnums or two strings
2091 // Returns true if code was generated, otherwise false
2092 static bool
2093 gen_equality_specialized(jitstate_t *jit, ctx_t *ctx, uint8_t *side_exit)
2095 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2096 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2098 x86opnd_t a_opnd = ctx_stack_opnd(ctx, 1);
2099 x86opnd_t b_opnd = ctx_stack_opnd(ctx, 0);
2101 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2102 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_EQ)) {
2103 // if overridden, emit the generic version
2104 return false;
2107 guard_two_fixnums(ctx, side_exit);
2109 mov(cb, REG0, a_opnd);
2110 cmp(cb, REG0, b_opnd);
2112 mov(cb, REG0, imm_opnd(Qfalse));
2113 mov(cb, REG1, imm_opnd(Qtrue));
2114 cmove(cb, REG0, REG1);
2116 // Push the output on the stack
2117 ctx_stack_pop(ctx, 2);
2118 x86opnd_t dst = ctx_stack_push(ctx, TYPE_IMM);
2119 mov(cb, dst, REG0);
2121 return true;
2123 else if (CLASS_OF(comptime_a) == rb_cString &&
2124 CLASS_OF(comptime_b) == rb_cString) {
2125 if (!assume_bop_not_redefined(jit, STRING_REDEFINED_OP_FLAG, BOP_EQ)) {
2126 // if overridden, emit the generic version
2127 return false;
2130 // Load a and b in preparation for call later
2131 mov(cb, C_ARG_REGS[0], a_opnd);
2132 mov(cb, C_ARG_REGS[1], b_opnd);
2134 // Guard that a is a String
2135 mov(cb, REG0, C_ARG_REGS[0]);
2136 jit_guard_known_klass(jit, ctx, rb_cString, OPND_STACK(1), comptime_a, SEND_MAX_DEPTH, side_exit);
2138 uint32_t ret = cb_new_label(cb, "ret");
2140 // If they are equal by identity, return true
2141 cmp(cb, C_ARG_REGS[0], C_ARG_REGS[1]);
2142 mov(cb, RAX, imm_opnd(Qtrue));
2143 je_label(cb, ret);
2145 // Otherwise guard that b is a T_STRING (from type info) or String (from runtime guard)
2146 if (ctx_get_opnd_type(ctx, OPND_STACK(0)).type != ETYPE_STRING) {
2147 mov(cb, REG0, C_ARG_REGS[1]);
2148 // Note: any T_STRING is valid here, but we check for ::String for simplicity
2149 jit_guard_known_klass(jit, ctx, rb_cString, OPND_STACK(0), comptime_b, SEND_MAX_DEPTH, side_exit);
2152 // Call rb_str_eql_internal(a, b)
2153 call_ptr(cb, REG0, (void *)rb_str_eql_internal);
2155 // Push the output on the stack
2156 cb_write_label(cb, ret);
2157 ctx_stack_pop(ctx, 2);
2158 x86opnd_t dst = ctx_stack_push(ctx, TYPE_IMM);
2159 mov(cb, dst, RAX);
2160 cb_link_labels(cb);
2162 return true;
2164 else {
2165 return false;
2169 static codegen_status_t
2170 gen_opt_eq(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2172 // Defer compilation so we can specialize based on a runtime receiver
2173 if (!jit_at_current_insn(jit)) {
2174 defer_compilation(jit, ctx);
2175 return YJIT_END_BLOCK;
2178 // Create a side-exit to fall back to the interpreter
2179 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2181 if (gen_equality_specialized(jit, ctx, side_exit)) {
2182 jit_jump_to_next_insn(jit, ctx);
2183 return YJIT_END_BLOCK;
2185 else {
2186 return gen_opt_send_without_block(jit, ctx, cb);
2190 static codegen_status_t gen_send_general(jitstate_t *jit, ctx_t *ctx, struct rb_call_data *cd, rb_iseq_t *block);
2192 static codegen_status_t
2193 gen_opt_neq(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2195 // opt_neq is passed two rb_call_data as arguments:
2196 // first for ==, second for !=
2197 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 1);
2198 return gen_send_general(jit, ctx, cd, NULL);
2201 static codegen_status_t
2202 gen_opt_aref(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2204 struct rb_call_data * cd = (struct rb_call_data *)jit_get_arg(jit, 0);
2205 int32_t argc = (int32_t)vm_ci_argc(cd->ci);
2207 // Only JIT one arg calls like `ary[6]`
2208 if (argc != 1) {
2209 GEN_COUNTER_INC(cb, oaref_argc_not_one);
2210 return YJIT_CANT_COMPILE;
2213 // Defer compilation so we can specialize based on a runtime receiver
2214 if (!jit_at_current_insn(jit)) {
2215 defer_compilation(jit, ctx);
2216 return YJIT_END_BLOCK;
2219 // Remember the context on entry for adding guard chains
2220 const ctx_t starting_context = *ctx;
2222 // Specialize based on compile-time values
2223 VALUE comptime_idx = jit_peek_at_stack(jit, ctx, 0);
2224 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, 1);
2226 // Create a side-exit to fall back to the interpreter
2227 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2229 if (CLASS_OF(comptime_recv) == rb_cArray && RB_FIXNUM_P(comptime_idx)) {
2230 if (!assume_bop_not_redefined(jit, ARRAY_REDEFINED_OP_FLAG, BOP_AREF)) {
2231 return YJIT_CANT_COMPILE;
2234 // Pop the stack operands
2235 x86opnd_t idx_opnd = ctx_stack_pop(ctx, 1);
2236 x86opnd_t recv_opnd = ctx_stack_pop(ctx, 1);
2237 mov(cb, REG0, recv_opnd);
2239 // if (SPECIAL_CONST_P(recv)) {
2240 // Bail if receiver is not a heap object
2241 test(cb, REG0, imm_opnd(RUBY_IMMEDIATE_MASK));
2242 jnz_ptr(cb, side_exit);
2243 cmp(cb, REG0, imm_opnd(Qfalse));
2244 je_ptr(cb, side_exit);
2245 cmp(cb, REG0, imm_opnd(Qnil));
2246 je_ptr(cb, side_exit);
2248 // Bail if recv has a class other than ::Array.
2249 // BOP_AREF check above is only good for ::Array.
2250 mov(cb, REG1, mem_opnd(64, REG0, offsetof(struct RBasic, klass)));
2251 mov(cb, REG0, const_ptr_opnd((void *)rb_cArray));
2252 cmp(cb, REG0, REG1);
2253 jit_chain_guard(JCC_JNE, jit, &starting_context, OPT_AREF_MAX_CHAIN_DEPTH, side_exit);
2255 // Bail if idx is not a FIXNUM
2256 mov(cb, REG1, idx_opnd);
2257 test(cb, REG1, imm_opnd(RUBY_FIXNUM_FLAG));
2258 jz_ptr(cb, COUNTED_EXIT(jit, side_exit, oaref_arg_not_fixnum));
2260 // Call VALUE rb_ary_entry_internal(VALUE ary, long offset).
2261 // It never raises or allocates, so we don't need to write to cfp->pc.
2263 mov(cb, RDI, recv_opnd);
2264 sar(cb, REG1, imm_opnd(1)); // Convert fixnum to long (FIX2LONG)
2265 mov(cb, RSI, REG1);
2266 call_ptr(cb, REG0, (void *)rb_ary_entry_internal);
2268 // Push the return value onto the stack
2269 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2270 mov(cb, stack_ret, RAX);
2273 // Jump to next instruction. This allows guard chains to share the same successor.
2274 jit_jump_to_next_insn(jit, ctx);
2275 return YJIT_END_BLOCK;
2277 else if (CLASS_OF(comptime_recv) == rb_cHash) {
2278 if (!assume_bop_not_redefined(jit, HASH_REDEFINED_OP_FLAG, BOP_AREF)) {
2279 return YJIT_CANT_COMPILE;
2282 x86opnd_t key_opnd = ctx_stack_opnd(ctx, 0);
2283 x86opnd_t recv_opnd = ctx_stack_opnd(ctx, 1);
2285 // Guard that the receiver is a hash
2286 mov(cb, REG0, recv_opnd);
2287 jit_guard_known_klass(jit, ctx, rb_cHash, OPND_STACK(1), comptime_recv, OPT_AREF_MAX_CHAIN_DEPTH, side_exit);
2289 // Setup arguments for rb_hash_aref().
2290 mov(cb, C_ARG_REGS[0], REG0);
2291 mov(cb, C_ARG_REGS[1], key_opnd);
2293 // Prepare to call rb_hash_aref(). It might call #hash on the key.
2294 jit_prepare_routine_call(jit, ctx, REG0);
2296 call_ptr(cb, REG0, (void *)rb_hash_aref);
2298 // Pop the key and the receiver
2299 (void)ctx_stack_pop(ctx, 2);
2301 // Push the return value onto the stack
2302 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2303 mov(cb, stack_ret, RAX);
2305 // Jump to next instruction. This allows guard chains to share the same successor.
2306 jit_jump_to_next_insn(jit, ctx);
2307 return YJIT_END_BLOCK;
2309 else {
2310 // General case. Call the [] method.
2311 return gen_opt_send_without_block(jit, ctx, cb);
2315 static codegen_status_t
2316 gen_opt_aset(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2318 // Defer compilation so we can specialize on a runtime `self`
2319 if (!jit_at_current_insn(jit)) {
2320 defer_compilation(jit, ctx);
2321 return YJIT_END_BLOCK;
2324 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, 2);
2325 VALUE comptime_key = jit_peek_at_stack(jit, ctx, 1);
2327 // Get the operands from the stack
2328 x86opnd_t recv = ctx_stack_opnd(ctx, 2);
2329 x86opnd_t key = ctx_stack_opnd(ctx, 1);
2330 x86opnd_t val = ctx_stack_opnd(ctx, 0);
2332 if (CLASS_OF(comptime_recv) == rb_cArray && FIXNUM_P(comptime_key)) {
2333 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2335 // Guard receiver is an Array
2336 mov(cb, REG0, recv);
2337 jit_guard_known_klass(jit, ctx, rb_cArray, OPND_STACK(2), comptime_recv, SEND_MAX_DEPTH, side_exit);
2339 // Guard key is a fixnum
2340 mov(cb, REG0, key);
2341 jit_guard_known_klass(jit, ctx, rb_cInteger, OPND_STACK(1), comptime_key, SEND_MAX_DEPTH, side_exit);
2343 // Call rb_ary_store
2344 mov(cb, C_ARG_REGS[0], recv);
2345 mov(cb, C_ARG_REGS[1], key);
2346 sar(cb, C_ARG_REGS[1], imm_opnd(1)); // FIX2LONG(key)
2347 mov(cb, C_ARG_REGS[2], val);
2349 // We might allocate or raise
2350 jit_prepare_routine_call(jit, ctx, REG0);
2352 call_ptr(cb, REG0, (void *)rb_ary_store);
2354 // rb_ary_store returns void
2355 // stored value should still be on stack
2356 mov(cb, REG0, ctx_stack_opnd(ctx, 0));
2358 // Push the return value onto the stack
2359 ctx_stack_pop(ctx, 3);
2360 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2361 mov(cb, stack_ret, REG0);
2363 jit_jump_to_next_insn(jit, ctx);
2364 return YJIT_END_BLOCK;
2366 else if (CLASS_OF(comptime_recv) == rb_cHash) {
2367 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2369 // Guard receiver is a Hash
2370 mov(cb, REG0, recv);
2371 jit_guard_known_klass(jit, ctx, rb_cHash, OPND_STACK(2), comptime_recv, SEND_MAX_DEPTH, side_exit);
2373 // Call rb_hash_aset
2374 mov(cb, C_ARG_REGS[0], recv);
2375 mov(cb, C_ARG_REGS[1], key);
2376 mov(cb, C_ARG_REGS[2], val);
2378 // We might allocate or raise
2379 jit_prepare_routine_call(jit, ctx, REG0);
2381 call_ptr(cb, REG0, (void *)rb_hash_aset);
2383 // Push the return value onto the stack
2384 ctx_stack_pop(ctx, 3);
2385 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2386 mov(cb, stack_ret, RAX);
2388 jit_jump_to_next_insn(jit, ctx);
2389 return YJIT_END_BLOCK;
2391 else {
2392 return gen_opt_send_without_block(jit, ctx, cb);
2396 static codegen_status_t
2397 gen_opt_and(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2399 // Defer compilation so we can specialize on a runtime `self`
2400 if (!jit_at_current_insn(jit)) {
2401 defer_compilation(jit, ctx);
2402 return YJIT_END_BLOCK;
2405 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2406 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2408 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2409 // Create a side-exit to fall back to the interpreter
2410 // Note: we generate the side-exit before popping operands from the stack
2411 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2413 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_AND)) {
2414 return YJIT_CANT_COMPILE;
2417 // Check that both operands are fixnums
2418 guard_two_fixnums(ctx, side_exit);
2420 // Get the operands and destination from the stack
2421 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2422 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2424 // Do the bitwise AND: arg0 & arg1
2425 mov(cb, REG0, arg0);
2426 and(cb, REG0, arg1);
2428 // Push the output on the stack
2429 x86opnd_t dst = ctx_stack_push(ctx, TYPE_FIXNUM);
2430 mov(cb, dst, REG0);
2432 return YJIT_KEEP_COMPILING;
2434 else {
2435 // Delegate to send, call the method on the recv
2436 return gen_opt_send_without_block(jit, ctx, cb);
2440 static codegen_status_t
2441 gen_opt_or(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2443 // Defer compilation so we can specialize on a runtime `self`
2444 if (!jit_at_current_insn(jit)) {
2445 defer_compilation(jit, ctx);
2446 return YJIT_END_BLOCK;
2449 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2450 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2452 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2453 // Create a side-exit to fall back to the interpreter
2454 // Note: we generate the side-exit before popping operands from the stack
2455 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2457 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_OR)) {
2458 return YJIT_CANT_COMPILE;
2461 // Check that both operands are fixnums
2462 guard_two_fixnums(ctx, side_exit);
2464 // Get the operands and destination from the stack
2465 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2466 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2468 // Do the bitwise OR: arg0 | arg1
2469 mov(cb, REG0, arg0);
2470 or(cb, REG0, arg1);
2472 // Push the output on the stack
2473 x86opnd_t dst = ctx_stack_push(ctx, TYPE_FIXNUM);
2474 mov(cb, dst, REG0);
2476 return YJIT_KEEP_COMPILING;
2478 else {
2479 // Delegate to send, call the method on the recv
2480 return gen_opt_send_without_block(jit, ctx, cb);
2484 static codegen_status_t
2485 gen_opt_minus(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2487 // Defer compilation so we can specialize on a runtime `self`
2488 if (!jit_at_current_insn(jit)) {
2489 defer_compilation(jit, ctx);
2490 return YJIT_END_BLOCK;
2493 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2494 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2496 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2497 // Create a side-exit to fall back to the interpreter
2498 // Note: we generate the side-exit before popping operands from the stack
2499 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2501 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_MINUS)) {
2502 return YJIT_CANT_COMPILE;
2505 // Check that both operands are fixnums
2506 guard_two_fixnums(ctx, side_exit);
2508 // Get the operands and destination from the stack
2509 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2510 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2512 // Subtract arg0 - arg1 and test for overflow
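// With tagged fixnums, arg0 - arg1 == ((a - b) << 1): the tag bits cancel out,
// so we re-add 1 below to restore the tag. Signed overflow means the result
// no longer fits in a fixnum, so we side-exit.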
2513 mov(cb, REG0, arg0);
2514 sub(cb, REG0, arg1);
2515 jo_ptr(cb, side_exit);
2516 add(cb, REG0, imm_opnd(1));
2518 // Push the output on the stack
2519 x86opnd_t dst = ctx_stack_push(ctx, TYPE_FIXNUM);
2520 mov(cb, dst, REG0);
2522 return YJIT_KEEP_COMPILING;
2524 else {
2525 // Delegate to send, call the method on the recv
2526 return gen_opt_send_without_block(jit, ctx, cb);
2530 static codegen_status_t
2531 gen_opt_plus(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2533 // Defer compilation so we can specialize on a runtime `self`
2534 if (!jit_at_current_insn(jit)) {
2535 defer_compilation(jit, ctx);
2536 return YJIT_END_BLOCK;
2539 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2540 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2542 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2543 // Create a side-exit to fall back to the interpreter
2544 // Note: we generate the side-exit before popping operands from the stack
2545 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2547 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_PLUS)) {
2548 return YJIT_CANT_COMPILE;
2551 // Check that both operands are fixnums
2552 guard_two_fixnums(ctx, side_exit);
2554 // Get the operands and destination from the stack
2555 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2556 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2558 // Add arg0 + arg1 and test for overflow
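// With tagged fixnums, (arg0 - 1) + arg1 == ((a + b) << 1) | 1, which is
// already the tagged sum. Signed overflow means the result does not fit in a
// fixnum, so we side-exit.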
2559 mov(cb, REG0, arg0);
2560 sub(cb, REG0, imm_opnd(1));
2561 add(cb, REG0, arg1);
2562 jo_ptr(cb, side_exit);
2564 // Push the output on the stack
2565 x86opnd_t dst = ctx_stack_push(ctx, TYPE_FIXNUM);
2566 mov(cb, dst, REG0);
2568 return YJIT_KEEP_COMPILING;
2570 else {
2571 // Delegate to send, call the method on the recv
2572 return gen_opt_send_without_block(jit, ctx, cb);
2576 static codegen_status_t
2577 gen_opt_mult(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2579 // Delegate to send, call the method on the recv
2580 return gen_opt_send_without_block(jit, ctx, cb);
2583 static codegen_status_t
2584 gen_opt_div(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2586 // Delegate to send, call the method on the recv
2587 return gen_opt_send_without_block(jit, ctx, cb);
2590 VALUE rb_vm_opt_mod(VALUE recv, VALUE obj);
2592 static codegen_status_t
2593 gen_opt_mod(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2595 // Save the PC and SP because the callee may allocate bignums
2596 // Note that this modifies REG_SP, which is why we do it first
2597 jit_prepare_routine_call(jit, ctx, REG0);
2599 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2601 // Get the operands from the stack
2602 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2603 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2605 // Call rb_vm_opt_mod(VALUE recv, VALUE obj)
2606 mov(cb, C_ARG_REGS[0], arg0);
2607 mov(cb, C_ARG_REGS[1], arg1);
2608 call_ptr(cb, REG0, (void *)rb_vm_opt_mod);
2610 // If val == Qundef, bail to do a method call
2611 cmp(cb, RAX, imm_opnd(Qundef));
2612 je_ptr(cb, side_exit);
2614 // Push the return value onto the stack
2615 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2616 mov(cb, stack_ret, RAX);
2618 return YJIT_KEEP_COMPILING;
2621 static codegen_status_t
2622 gen_opt_ltlt(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2624 // Delegate to send, call the method on the recv
2625 return gen_opt_send_without_block(jit, ctx, cb);
2628 static codegen_status_t
2629 gen_opt_nil_p(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2631 // Delegate to send, call the method on the recv
2632 return gen_opt_send_without_block(jit, ctx, cb);
2635 static codegen_status_t
2636 gen_opt_empty_p(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2638 // Delegate to send, call the method on the recv
2639 return gen_opt_send_without_block(jit, ctx, cb);
2642 static codegen_status_t
2643 gen_opt_str_freeze(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2645 if (!assume_bop_not_redefined(jit, STRING_REDEFINED_OP_FLAG, BOP_FREEZE)) {
2646 return YJIT_CANT_COMPILE;
2649 VALUE str = jit_get_arg(jit, 0);
2650 jit_mov_gc_ptr(jit, cb, REG0, str);
2652 // Push the return value onto the stack
2653 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_STRING);
2654 mov(cb, stack_ret, REG0);
2656 return YJIT_KEEP_COMPILING;
2659 static codegen_status_t
2660 gen_opt_str_uminus(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2662 if (!assume_bop_not_redefined(jit, STRING_REDEFINED_OP_FLAG, BOP_UMINUS)) {
2663 return YJIT_CANT_COMPILE;
2666 VALUE str = jit_get_arg(jit, 0);
2667 jit_mov_gc_ptr(jit, cb, REG0, str);
2669 // Push the return value onto the stack
2670 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_STRING);
2671 mov(cb, stack_ret, REG0);
2673 return YJIT_KEEP_COMPILING;
2676 static codegen_status_t
2677 gen_opt_not(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2679 return gen_opt_send_without_block(jit, ctx, cb);
2682 static codegen_status_t
2683 gen_opt_size(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2685 return gen_opt_send_without_block(jit, ctx, cb);
2688 static codegen_status_t
2689 gen_opt_length(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2691 return gen_opt_send_without_block(jit, ctx, cb);
2694 static codegen_status_t
2695 gen_opt_regexpmatch2(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2697 return gen_opt_send_without_block(jit, ctx, cb);
2700 static codegen_status_t
2701 gen_opt_case_dispatch(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2703 // Normally this instruction would look up the key in a hash and jump to an
2704 // offset based on that.
2705 // Instead we can take the fallback case and continue with the next
2706 // instruction.
2707 // We'd hope that our jitted code will be sufficiently fast without the
2708 // hash lookup, at least for small hashes, but it's worth revisiting this
2709 // assumption in the future.
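// For example, a `case` whose `when` clauses are all static literals compiles
// to opt_case_dispatch followed by the equivalent sequential `when` checks, so
// taking the fallback path here is still correct, just potentially slower.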
2711 ctx_stack_pop(ctx, 1);
2713 return YJIT_KEEP_COMPILING; // continue with the next instruction
2716 static void
2717 gen_branchif_branch(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
2719 switch (shape) {
2720 case SHAPE_NEXT0:
2721 jz_ptr(cb, target1);
2722 break;
2724 case SHAPE_NEXT1:
2725 jnz_ptr(cb, target0);
2726 break;
2728 case SHAPE_DEFAULT:
2729 jnz_ptr(cb, target0);
2730 jmp_ptr(cb, target1);
2731 break;
2735 static codegen_status_t
2736 gen_branchif(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2738 int32_t jump_offset = (int32_t)jit_get_arg(jit, 0);
2740 // Check for interrupts, but only on backward branches that may create loops
2741 if (jump_offset < 0) {
2742 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2743 yjit_check_ints(cb, side_exit);
2746 // Test if any bit (outside of the Qnil bit) is on
2747 // RUBY_Qfalse /* ...0000 0000 */
2748 // RUBY_Qnil /* ...0000 1000 */
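// val & ~Qnil is zero only for Qfalse (0b0000) and Qnil (0b1000), so the zero
// flag is set exactly when the value is falsey.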
2749 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
2750 test(cb, val_opnd, imm_opnd(~Qnil));
2752 // Get the branch target instruction offsets
2753 uint32_t next_idx = jit_next_insn_idx(jit);
2754 uint32_t jump_idx = next_idx + jump_offset;
2755 blockid_t next_block = { jit->iseq, next_idx };
2756 blockid_t jump_block = { jit->iseq, jump_idx };
2758 // Generate the branch instructions
2759 gen_branch(
2760 jit,
2761 ctx,
2762 jump_block,
2763 ctx,
2764 next_block,
2765 ctx,
2766 gen_branchif_branch
2769 return YJIT_END_BLOCK;
2772 static void
2773 gen_branchunless_branch(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
2775 switch (shape) {
2776 case SHAPE_NEXT0:
2777 jnz_ptr(cb, target1);
2778 break;
2780 case SHAPE_NEXT1:
2781 jz_ptr(cb, target0);
2782 break;
2784 case SHAPE_DEFAULT:
2785 jz_ptr(cb, target0);
2786 jmp_ptr(cb, target1);
2787 break;
2791 static codegen_status_t
2792 gen_branchunless(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2794 int32_t jump_offset = (int32_t)jit_get_arg(jit, 0);
2796 // Check for interrupts, but only on backward branches that may create loops
2797 if (jump_offset < 0) {
2798 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2799 yjit_check_ints(cb, side_exit);
2802 // Test if any bit (outside of the Qnil bit) is on
2803 // RUBY_Qfalse /* ...0000 0000 */
2804 // RUBY_Qnil /* ...0000 1000 */
2805 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
2806 test(cb, val_opnd, imm_opnd(~Qnil));
2808 // Get the branch target instruction offsets
2809 uint32_t next_idx = jit_next_insn_idx(jit);
2810 uint32_t jump_idx = next_idx + jump_offset;
2811 blockid_t next_block = { jit->iseq, next_idx };
2812 blockid_t jump_block = { jit->iseq, jump_idx };
2814 // Generate the branch instructions
2815 gen_branch(
2816 jit,
2817 ctx,
2818 jump_block,
2819 ctx,
2820 next_block,
2821 ctx,
2822 gen_branchunless_branch
2825 return YJIT_END_BLOCK;
2828 static void
2829 gen_branchnil_branch(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
2831 switch (shape) {
2832 case SHAPE_NEXT0:
2833 jne_ptr(cb, target1);
2834 break;
2836 case SHAPE_NEXT1:
2837 je_ptr(cb, target0);
2838 break;
2840 case SHAPE_DEFAULT:
2841 je_ptr(cb, target0);
2842 jmp_ptr(cb, target1);
2843 break;
2847 static codegen_status_t
2848 gen_branchnil(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2850 int32_t jump_offset = (int32_t)jit_get_arg(jit, 0);
2852 // Check for interrupts, but only on backward branches that may create loops
2853 if (jump_offset < 0) {
2854 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2855 yjit_check_ints(cb, side_exit);
2858 // Test if the value is Qnil
2859 // RUBY_Qnil /* ...0000 1000 */
2860 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
2861 cmp(cb, val_opnd, imm_opnd(Qnil));
2863 // Get the branch target instruction offsets
2864 uint32_t next_idx = jit_next_insn_idx(jit);
2865 uint32_t jump_idx = next_idx + jump_offset;
2866 blockid_t next_block = { jit->iseq, next_idx };
2867 blockid_t jump_block = { jit->iseq, jump_idx };
2869 // Generate the branch instructions
2870 gen_branch(
2871 jit,
2872 ctx,
2873 jump_block,
2874 ctx,
2875 next_block,
2876 ctx,
2877 gen_branchnil_branch
2880 return YJIT_END_BLOCK;
2883 static codegen_status_t
2884 gen_jump(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2886 int32_t jump_offset = (int32_t)jit_get_arg(jit, 0);
2888 // Check for interrupts, but only on backward branches that may create loops
2889 if (jump_offset < 0) {
2890 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2891 yjit_check_ints(cb, side_exit);
2894 // Get the branch target instruction offsets
2895 uint32_t jump_idx = jit_next_insn_idx(jit) + jump_offset;
2896 blockid_t jump_block = { jit->iseq, jump_idx };
2898 // Generate the jump instruction
2899 gen_direct_jump(
2900 jit,
2901 ctx,
2902 jump_block
2905 return YJIT_END_BLOCK;
2909 Guard that self or a stack operand has the same class as `known_klass`, using
2910 `sample_instance` to speculate about the shape of the runtime value.
2911 FIXNUM and on-heap integers are treated as if they have distinct classes, and
2912 the guard generated for one will fail for the other.
2914 Recompile as a contingency if possible, or take a side exit as a last resort.
2916 static bool
2917 jit_guard_known_klass(jitstate_t *jit, ctx_t *ctx, VALUE known_klass, insn_opnd_t insn_opnd, VALUE sample_instance, const int max_chain_depth, uint8_t *side_exit)
2919 val_type_t val_type = ctx_get_opnd_type(ctx, insn_opnd);
2921 if (known_klass == rb_cNilClass) {
2922 RUBY_ASSERT(!val_type.is_heap);
2923 if (val_type.type != ETYPE_NIL) {
2924 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2926 ADD_COMMENT(cb, "guard object is nil");
2927 cmp(cb, REG0, imm_opnd(Qnil));
2928 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2930 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_NIL);
2933 else if (known_klass == rb_cTrueClass) {
2934 RUBY_ASSERT(!val_type.is_heap);
2935 if (val_type.type != ETYPE_TRUE) {
2936 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2938 ADD_COMMENT(cb, "guard object is true");
2939 cmp(cb, REG0, imm_opnd(Qtrue));
2940 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2942 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_TRUE);
2945 else if (known_klass == rb_cFalseClass) {
2946 RUBY_ASSERT(!val_type.is_heap);
2947 if (val_type.type != ETYPE_FALSE) {
2948 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2950 ADD_COMMENT(cb, "guard object is false");
2951 STATIC_ASSERT(qfalse_is_zero, Qfalse == 0);
2952 test(cb, REG0, REG0);
2953 jit_chain_guard(JCC_JNZ, jit, ctx, max_chain_depth, side_exit);
2955 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_FALSE);
2958 else if (known_klass == rb_cInteger && FIXNUM_P(sample_instance)) {
2959 RUBY_ASSERT(!val_type.is_heap);
2960 // We will guard fixnum and bignum as though they were separate classes
2961 // BIGNUM can be handled by the general else case below
2962 if (val_type.type != ETYPE_FIXNUM || !val_type.is_imm) {
2963 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2965 ADD_COMMENT(cb, "guard object is fixnum");
2966 test(cb, REG0, imm_opnd(RUBY_FIXNUM_FLAG));
2967 jit_chain_guard(JCC_JZ, jit, ctx, max_chain_depth, side_exit);
2968 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_FIXNUM);
2971 else if (known_klass == rb_cSymbol && STATIC_SYM_P(sample_instance)) {
2972 RUBY_ASSERT(!val_type.is_heap);
2973 // We will guard STATIC vs DYNAMIC as though they were separate classes
2974 // DYNAMIC symbols can be handled by the general else case below
2975 if (val_type.type != ETYPE_SYMBOL || !val_type.is_imm) {
2976 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2978 ADD_COMMENT(cb, "guard object is static symbol");
2979 STATIC_ASSERT(special_shift_is_8, RUBY_SPECIAL_SHIFT == 8);
2980 cmp(cb, REG0_8, imm_opnd(RUBY_SYMBOL_FLAG));
2981 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2982 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_STATIC_SYMBOL);
2985 else if (known_klass == rb_cFloat && FLONUM_P(sample_instance)) {
2986 RUBY_ASSERT(!val_type.is_heap);
2987 if (val_type.type != ETYPE_FLONUM || !val_type.is_imm) {
2988 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2990 // We will guard flonum vs heap float as though they were separate classes
2991 ADD_COMMENT(cb, "guard object is flonum");
2992 mov(cb, REG1, REG0);
2993 and(cb, REG1, imm_opnd(RUBY_FLONUM_MASK));
2994 cmp(cb, REG1, imm_opnd(RUBY_FLONUM_FLAG));
2995 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2996 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_FLONUM);
2999 else if (FL_TEST(known_klass, FL_SINGLETON) && sample_instance == rb_attr_get(known_klass, id__attached__)) {
3000 // Singleton classes are attached to one specific object, so we can
3001 // avoid one memory access (and potentially the is_heap check) by
3002 // looking for the expected object directly.
3003 // Note that in case the sample instance has a singleton class that
3004 // doesn't attach to the sample instance, it means the sample instance
3005 // has an empty singleton class that hasn't been materialized yet. In
3006 // this case, comparing against the sample instance doesn't guarantee
3007 // that its singleton class is empty, so we can't avoid the memory
3008 // access. As an example, `Object.new.singleton_class` is an object in
3009 // this situation.
3010 ADD_COMMENT(cb, "guard known object with singleton class");
3011 // TODO: jit_mov_gc_ptr keeps a strong reference, which leaks the object.
3012 jit_mov_gc_ptr(jit, cb, REG1, sample_instance);
3013 cmp(cb, REG0, REG1);
3014 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
3016 else {
3017 RUBY_ASSERT(!val_type.is_imm);
3019 // Check that the receiver is a heap object
3020 // Note: if we get here, the class doesn't have immediate instances.
3021 if (!val_type.is_heap) {
3022 ADD_COMMENT(cb, "guard not immediate");
3023 RUBY_ASSERT(Qfalse < Qnil);
3024 test(cb, REG0, imm_opnd(RUBY_IMMEDIATE_MASK));
3025 jit_chain_guard(JCC_JNZ, jit, ctx, max_chain_depth, side_exit);
3026 cmp(cb, REG0, imm_opnd(Qnil));
3027 jit_chain_guard(JCC_JBE, jit, ctx, max_chain_depth, side_exit);
3029 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_HEAP);
3032 x86opnd_t klass_opnd = mem_opnd(64, REG0, offsetof(struct RBasic, klass));
3034 // Bail if receiver class is different from known_klass
3035 // TODO: jit_mov_gc_ptr keeps a strong reference, which leaks the class.
3036 ADD_COMMENT(cb, "guard known class");
3037 jit_mov_gc_ptr(jit, cb, REG1, known_klass);
3038 cmp(cb, klass_opnd, REG1);
3039 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
3042 return true;
3045 // Generate ancestry guard for protected callee.
3046 // Calls to protected callees only go through when self.is_a?(klass_that_defines_the_callee).
3047 static void
3048 jit_protected_callee_ancestry_guard(jitstate_t *jit, codeblock_t *cb, const rb_callable_method_entry_t *cme, uint8_t *side_exit)
3050 // See vm_call_method().
3051 mov(cb, C_ARG_REGS[0], member_opnd(REG_CFP, rb_control_frame_t, self));
3052 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[1], cme->defined_class);
3053 // Note: PC isn't written to the current control frame as rb_obj_is_kind_of() shouldn't raise.
3054 // VALUE rb_obj_is_kind_of(VALUE obj, VALUE klass);
3055 call_ptr(cb, REG0, (void *)&rb_obj_is_kind_of);
3056 test(cb, RAX, RAX);
3057 jz_ptr(cb, COUNTED_EXIT(jit, side_exit, send_se_protected_check_failed));
3060 // Return true when the codegen function generates code.
3061 // known_recv_klass is non-NULL when the caller has used jit_guard_known_klass().
3062 // See yjit_reg_method().
3063 typedef bool (*method_codegen_t)(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass);
3065 // Register a specialized codegen function for a particular method. Note that
3066 // if the function returns true, the code it generates runs without a
3067 // control frame and without interrupt checks. To avoid creating observable
3068 // behavior changes, the codegen function should only target simple code paths
3069 // that do not allocate and do not make method calls.
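// A hypothetical registration (the real ones are made during codegen
// initialization) would look like:
//     yjit_reg_method(rb_cNilClass, "nil?", jit_rb_true);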
3070 static void
3071 yjit_reg_method(VALUE klass, const char *mid_str, method_codegen_t gen_fn)
3073 ID mid = rb_intern(mid_str);
3074 const rb_method_entry_t *me = rb_method_entry_at(klass, mid);
3076 if (!me) {
3077 rb_bug("undefined optimized method: %s", rb_id2name(mid));
3080 // For now, only cfuncs are supported
3081 RUBY_ASSERT(me && me->def);
3082 RUBY_ASSERT(me->def->type == VM_METHOD_TYPE_CFUNC);
3084 st_insert(yjit_method_codegen_table, (st_data_t)me->def->method_serial, (st_data_t)gen_fn);
3087 // Codegen for rb_obj_not().
3088 // Note: the caller is responsible for generating all the right guards, including
3089 // arity guards.
3090 static bool
3091 jit_rb_obj_not(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3093 const val_type_t recv_opnd = ctx_get_opnd_type(ctx, OPND_STACK(0));
3095 if (recv_opnd.type == ETYPE_NIL || recv_opnd.type == ETYPE_FALSE) {
3096 ADD_COMMENT(cb, "rb_obj_not(nil_or_false)");
3097 ctx_stack_pop(ctx, 1);
3098 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_TRUE);
3099 mov(cb, out_opnd, imm_opnd(Qtrue));
3101 else if (recv_opnd.is_heap || recv_opnd.type != ETYPE_UNKNOWN) {
3102 // Note: recv_opnd.type != ETYPE_NIL && recv_opnd.type != ETYPE_FALSE.
3103 ADD_COMMENT(cb, "rb_obj_not(truthy)");
3104 ctx_stack_pop(ctx, 1);
3105 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_FALSE);
3106 mov(cb, out_opnd, imm_opnd(Qfalse));
3108 else {
3109 // jit_guard_known_klass() already ran on the receiver, which should
3110 // have deduced the type of the receiver. This case should be
3111 // rare if not unreachable.
3112 return false;
3114 return true;
3117 // Codegen for rb_true()
3118 static bool
3119 jit_rb_true(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3121 ADD_COMMENT(cb, "nil? == true");
3122 ctx_stack_pop(ctx, 1);
3123 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_TRUE);
3124 mov(cb, stack_ret, imm_opnd(Qtrue));
3125 return true;
3128 // Codegen for rb_false()
3129 static bool
3130 jit_rb_false(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3132 ADD_COMMENT(cb, "nil? == false");
3133 ctx_stack_pop(ctx, 1);
3134 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_FALSE);
3135 mov(cb, stack_ret, imm_opnd(Qfalse));
3136 return true;
3139 // Codegen for rb_obj_equal()
3140 // object identity comparison
3141 static bool
3142 jit_rb_obj_equal(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3144 ADD_COMMENT(cb, "equal?");
3145 x86opnd_t obj1 = ctx_stack_pop(ctx, 1);
3146 x86opnd_t obj2 = ctx_stack_pop(ctx, 1);
3148 mov(cb, REG0, obj1);
3149 cmp(cb, REG0, obj2);
3150 mov(cb, REG0, imm_opnd(Qtrue));
3151 mov(cb, REG1, imm_opnd(Qfalse));
3152 cmovne(cb, REG0, REG1);
3154 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_IMM);
3155 mov(cb, stack_ret, REG0);
3156 return true;
3159 static VALUE
3160 yjit_str_bytesize(VALUE str)
3162 return LONG2NUM(RSTRING_LEN(str));
3165 static bool
3166 jit_rb_str_bytesize(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3168 ADD_COMMENT(cb, "String#bytesize");
3170 x86opnd_t recv = ctx_stack_pop(ctx, 1);
3171 mov(cb, C_ARG_REGS[0], recv);
3172 call_ptr(cb, REG0, (void *)&yjit_str_bytesize);
3174 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_FIXNUM);
3175 mov(cb, out_opnd, RAX);
3177 return true;
3180 // Codegen for rb_str_to_s()
3181 // When String#to_s is called on a String instance, the method returns self and
3182 // most of the overhead comes from setting up the method call. We observed that
3183 // this situation happens a lot in some workloads.
3184 static bool
3185 jit_rb_str_to_s(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *recv_known_klass)
3187 if (recv_known_klass && *recv_known_klass == rb_cString) {
3188 ADD_COMMENT(cb, "to_s on plain string");
3189 // The method returns the receiver, which is already on the stack.
3190 // No stack movement.
3191 return true;
3193 return false;
3196 static bool
3197 jit_thread_s_current(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *recv_known_klass)
3199 ADD_COMMENT(cb, "Thread.current");
3200 ctx_stack_pop(ctx, 1);
3202 // ec->thread_ptr
3203 mov(cb, REG0, member_opnd(REG_EC, rb_execution_context_t, thread_ptr));
3205 // thread->self
3206 mov(cb, REG0, member_opnd(REG0, rb_thread_t, self));
3208 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HEAP);
3209 mov(cb, stack_ret, REG0);
3210 return true;
3213 // Check if we know how to codegen for a particular cfunc method
3214 static method_codegen_t
3215 lookup_cfunc_codegen(const rb_method_definition_t *def)
3217 method_codegen_t gen_fn;
3218 if (st_lookup(yjit_method_codegen_table, def->method_serial, (st_data_t *)&gen_fn)) {
3219 return gen_fn;
3221 return NULL;
3224 // Is anyone listening for :c_call and :c_return events currently?
3225 static bool
3226 c_method_tracing_currently_enabled(const jitstate_t *jit)
3228 rb_event_flag_t tracing_events;
3229 if (rb_multi_ractor_p()) {
3230 tracing_events = ruby_vm_event_enabled_global_flags;
3232 else {
3233 // At the time of writing, events are never removed from
3234 // ruby_vm_event_enabled_global_flags, so always checking it would
3235 // mean we don't compile even after tracing is disabled.
3236 tracing_events = rb_ec_ractor_hooks(jit->ec)->events;
3239 return tracing_events & (RUBY_EVENT_C_CALL | RUBY_EVENT_C_RETURN);
3242 // Called at runtime to build hashes of passed kwargs
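// For a hypothetical call like foo(a: 1, b: 2), keyword_len is 2 and the two
// values sit just below sp in keyword order, so (sp - kw_len)[i] pairs each
// keyword with its value.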
3243 static VALUE
3244 yjit_runtime_build_kwhash(const struct rb_callinfo *ci, const VALUE *sp) {
3245 // similar to args_kw_argv_to_hash
3246 const VALUE *const passed_keywords = vm_ci_kwarg(ci)->keywords;
3247 const int kw_len = vm_ci_kwarg(ci)->keyword_len;
3248 const VALUE h = rb_hash_new_with_size(kw_len);
3250 for (int i = 0; i < kw_len; i++) {
3251 rb_hash_aset(h, passed_keywords[i], (sp - kw_len)[i]);
3253 return h;
3256 static codegen_status_t
3257 gen_send_cfunc(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *recv_known_klass)
3259 const rb_method_cfunc_t *cfunc = UNALIGNED_MEMBER_PTR(cme->def, body.cfunc);
3261 const struct rb_callinfo_kwarg *kw_arg = vm_ci_kwarg(ci);
3262 const int kw_arg_num = kw_arg ? kw_arg->keyword_len : 0;
3264 // Number of args which will be passed through to the callee
3265 // This is adjusted by the kwargs being combined into a hash.
3266 const int passed_argc = kw_arg ? argc - kw_arg_num + 1 : argc;
3268 // If the argument count doesn't match
3269 if (cfunc->argc >= 0 && cfunc->argc != passed_argc) {
3270 GEN_COUNTER_INC(cb, send_cfunc_argc_mismatch);
3271 return YJIT_CANT_COMPILE;
3274 // Don't JIT functions that need C stack arguments for now
3275 if (cfunc->argc >= 0 && passed_argc + 1 > NUM_C_ARG_REGS) {
3276 GEN_COUNTER_INC(cb, send_cfunc_toomany_args);
3277 return YJIT_CANT_COMPILE;
3280 if (c_method_tracing_currently_enabled(jit)) {
3281 // Don't JIT if tracing c_call or c_return
3282 GEN_COUNTER_INC(cb, send_cfunc_tracing);
3283 return YJIT_CANT_COMPILE;
3286 // Delegate to codegen for C methods if we have it.
3288 method_codegen_t known_cfunc_codegen;
3289 if (!kw_arg && (known_cfunc_codegen = lookup_cfunc_codegen(cme->def))) {
3290 if (known_cfunc_codegen(jit, ctx, ci, cme, block, argc, recv_known_klass)) {
3291 // cfunc codegen generated code. Terminate the block so
3292 // there aren't multiple calls in the same block.
3293 jit_jump_to_next_insn(jit, ctx);
3294 return YJIT_END_BLOCK;
3299 // Callee method ID
3300 //ID mid = vm_ci_mid(ci);
3301 //printf("JITting call to C function \"%s\", argc: %lu\n", rb_id2name(mid), argc);
3302 //print_str(cb, "");
3303 //print_str(cb, "calling CFUNC:");
3304 //print_str(cb, rb_id2name(mid));
3305 //print_str(cb, "recv");
3306 //print_ptr(cb, recv);
3308 // Create a side-exit to fall back to the interpreter
3309 uint8_t *side_exit = yjit_side_exit(jit, ctx);
3311 // Check for interrupts
3312 yjit_check_ints(cb, side_exit);
3314 // Stack overflow check
3315 // #define CHECK_VM_STACK_OVERFLOW0(cfp, sp, margin)
3316 // REG_CFP <= REG_SP + 4 * sizeof(VALUE) + 2 * sizeof(rb_control_frame_t)
3317 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * 4 + 2 * sizeof(rb_control_frame_t)));
3318 cmp(cb, REG_CFP, REG0);
3319 jle_ptr(cb, COUNTED_EXIT(jit, side_exit, send_se_cf_overflow));
3321 // Points to the receiver operand on the stack
3322 x86opnd_t recv = ctx_stack_opnd(ctx, argc);
3324 // Store incremented PC into current control frame in case callee raises.
3325 jit_save_pc(jit, REG0);
3327 if (block) {
3328 // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block().
3329 // VM_CFP_TO_CAPTURED_BLOCK takes &cfp->self; rb_captured_block->code.iseq aliases
3330 // with cfp->block_code.
3331 jit_mov_gc_ptr(jit, cb, REG0, (VALUE)block);
3332 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, block_code), REG0);
3335 // Increment the stack pointer by 3 (in the callee)
3336 // sp += 3
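// The three slots written below hold the callee's method entry, block handler
// and frame type, matching the layout that vm_push_frame() sets up.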
3337 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * 3));
3339 // Write method entry at sp[-3]
3340 // sp[-3] = me;
3341 // Put compile time cme into REG1. It's assumed to be valid because we are notified when
3342 // any cme we depend on becomes outdated. See rb_yjit_method_lookup_change().
3343 jit_mov_gc_ptr(jit, cb, REG1, (VALUE)cme);
3344 mov(cb, mem_opnd(64, REG0, 8 * -3), REG1);
3346 // Write block handler at sp[-2]
3347 // sp[-2] = block_handler;
3348 if (block) {
3349 // reg1 = VM_BH_FROM_ISEQ_BLOCK(VM_CFP_TO_CAPTURED_BLOCK(reg_cfp));
3350 lea(cb, REG1, member_opnd(REG_CFP, rb_control_frame_t, self));
3351 or(cb, REG1, imm_opnd(1));
3352 mov(cb, mem_opnd(64, REG0, 8 * -2), REG1);
3354 else {
3355 mov(cb, mem_opnd(64, REG0, 8 * -2), imm_opnd(VM_BLOCK_HANDLER_NONE));
3358 // Write env flags at sp[-1]
3359 // sp[-1] = frame_type;
3360 uint64_t frame_type = VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL;
3361 if (kw_arg) {
3362 frame_type |= VM_FRAME_FLAG_CFRAME_KW;
3364 mov(cb, mem_opnd(64, REG0, 8 * -1), imm_opnd(frame_type));
3366 // Allocate a new CFP (ec->cfp--)
3367 sub(
3369 member_opnd(REG_EC, rb_execution_context_t, cfp),
3370 imm_opnd(sizeof(rb_control_frame_t))
3373 // Setup the new frame
3374 // *cfp = (const struct rb_control_frame_struct) {
3375 // .pc = 0,
3376 // .sp = sp,
3377 // .iseq = 0,
3378 // .self = recv,
3379 // .ep = sp - 1,
3380 // .block_code = 0,
3381 // .__bp__ = sp,
3382 // };
3383 mov(cb, REG1, member_opnd(REG_EC, rb_execution_context_t, cfp));
3384 mov(cb, member_opnd(REG1, rb_control_frame_t, pc), imm_opnd(0));
3385 mov(cb, member_opnd(REG1, rb_control_frame_t, sp), REG0);
3386 mov(cb, member_opnd(REG1, rb_control_frame_t, iseq), imm_opnd(0));
3387 mov(cb, member_opnd(REG1, rb_control_frame_t, block_code), imm_opnd(0));
3388 mov(cb, member_opnd(REG1, rb_control_frame_t, __bp__), REG0);
3389 sub(cb, REG0, imm_opnd(sizeof(VALUE)));
3390 mov(cb, member_opnd(REG1, rb_control_frame_t, ep), REG0);
3391 mov(cb, REG0, recv);
3392 mov(cb, member_opnd(REG1, rb_control_frame_t, self), REG0);
3394 // Verify that we are calling the right function
3395 if (YJIT_CHECK_MODE > 0) {
3396 // Call check_cfunc_dispatch
3397 mov(cb, C_ARG_REGS[0], recv);
3398 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[1], (VALUE)ci);
3399 mov(cb, C_ARG_REGS[2], const_ptr_opnd((void *)cfunc->func));
3400 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[3], (VALUE)cme);
3401 call_ptr(cb, REG0, (void *)&check_cfunc_dispatch);
3404 if (kw_arg) {
3405 // Build a hash from all kwargs passed
3406 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], (VALUE)ci);
3407 lea(cb, C_ARG_REGS[1], ctx_sp_opnd(ctx, 0));
3408 call_ptr(cb, REG0, (void *)&yjit_runtime_build_kwhash);
3410 // Replace the stack location at the start of kwargs with the new hash
3411 x86opnd_t stack_opnd = ctx_stack_opnd(ctx, argc - passed_argc);
3412 mov(cb, stack_opnd, RAX);
3415 // Non-variadic method
3416 if (cfunc->argc >= 0) {
3417 // Copy the arguments from the stack to the C argument registers
3418 // self is the 0th argument and is at index argc from the stack top
3419 for (int32_t i = 0; i < passed_argc + 1; ++i)
3421 x86opnd_t stack_opnd = ctx_stack_opnd(ctx, argc - i);
3422 x86opnd_t c_arg_reg = C_ARG_REGS[i];
3423 mov(cb, c_arg_reg, stack_opnd);
3426 // Variadic method
3427 if (cfunc->argc == -1) {
3428 // The method gets a pointer to the first argument
3429 // rb_f_puts(int argc, VALUE *argv, VALUE recv)
3430 mov(cb, C_ARG_REGS[0], imm_opnd(passed_argc));
3431 lea(cb, C_ARG_REGS[1], ctx_stack_opnd(ctx, argc - 1));
3432 mov(cb, C_ARG_REGS[2], ctx_stack_opnd(ctx, argc));
3434 // Variadic method with Ruby array
3435 if (cfunc->argc == -2) {
3436 // Create a Ruby array from the arguments.
3438 // This follows similar behaviour to vm_call_cfunc_with_frame() and
3439 // call_cfunc_m2(). We use rb_ec_ary_new_from_values() instead of
3440 // rb_ary_new4() since we have REG_EC available.
3442 // Before getting here we will have set the new CFP in the EC, and the
3443 // stack at CFP's SP will contain the values we are inserting into the
3444 // Array, so they will be properly marked if we hit a GC.
3446 // rb_ec_ary_new_from_values(rb_execution_context_t *ec, long n, const VALUE *elts)
3447 mov(cb, C_ARG_REGS[0], REG_EC);
3448 mov(cb, C_ARG_REGS[1], imm_opnd(passed_argc));
3449 lea(cb, C_ARG_REGS[2], ctx_stack_opnd(ctx, argc - 1));
3450 call_ptr(cb, REG0, (void *)rb_ec_ary_new_from_values);
3452 // rb_file_s_join(VALUE recv, VALUE args)
3453 mov(cb, C_ARG_REGS[0], ctx_stack_opnd(ctx, argc));
3454 mov(cb, C_ARG_REGS[1], RAX);
3457 // Pop the C function arguments from the stack (in the caller)
3458 ctx_stack_pop(ctx, argc + 1);
3460 // Write interpreter SP into CFP.
3461 // Needed in case the callee yields to the block.
3462 jit_save_sp(jit, ctx);
3464 // Call the C function
3465 // VALUE ret = (cfunc->func)(recv, argv[0], argv[1]);
3466 // cfunc comes from compile-time cme->def, which we assume to be stable.
3467 // Invalidation logic is in rb_yjit_method_lookup_change()
3468 call_ptr(cb, REG0, (void*)cfunc->func);
3470 // Record code position for TracePoint patching. See full_cfunc_return().
3471 record_global_inval_patch(cb, outline_full_cfunc_return_pos);
3473 // Push the return value on the Ruby stack
3474 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
3475 mov(cb, stack_ret, RAX);
3477 // Pop the stack frame (ec->cfp++)
3478 add(
3480 member_opnd(REG_EC, rb_execution_context_t, cfp),
3481 imm_opnd(sizeof(rb_control_frame_t))
3484 // cfunc calls may corrupt types
3485 ctx_clear_local_types(ctx);
3487 // Note: the return block of gen_send_iseq() has ctx->sp_offset == 1
3488 // which allows for sharing the same successor.
3490 // Jump (fall through) to the call continuation block
3491 // We do this to end the current block after the call
3492 jit_jump_to_next_insn(jit, ctx);
3493 return YJIT_END_BLOCK;
3496 static void
3497 gen_return_branch(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
3499 switch (shape) {
3500 case SHAPE_NEXT0:
3501 case SHAPE_NEXT1:
3502 RUBY_ASSERT(false);
3503 break;
3505 case SHAPE_DEFAULT:
3506 mov(cb, REG0, const_ptr_opnd(target0));
3507 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, jit_return), REG0);
3508 break;
3512 // If true, the iseq is a leaf and can be replaced by a single C call.
3513 static bool
3514 rb_leaf_invokebuiltin_iseq_p(const rb_iseq_t *iseq)
3516 unsigned int invokebuiltin_len = insn_len(BIN(opt_invokebuiltin_delegate_leave));
3517 unsigned int leave_len = insn_len(BIN(leave));
3519 return (iseq->body->iseq_size == (invokebuiltin_len + leave_len) &&
3520 rb_vm_insn_addr2opcode((void *)iseq->body->iseq_encoded[0]) == BIN(opt_invokebuiltin_delegate_leave) &&
3521 rb_vm_insn_addr2opcode((void *)iseq->body->iseq_encoded[invokebuiltin_len]) == BIN(leave) &&
3522 iseq->body->builtin_inline_p
3526 // Return an rb_builtin_function if the iseq contains only that leaf builtin function.
3527 static const struct rb_builtin_function*
3528 rb_leaf_builtin_function(const rb_iseq_t *iseq)
3530 if (!rb_leaf_invokebuiltin_iseq_p(iseq))
3531 return NULL;
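// iseq_encoded[0] is the opt_invokebuiltin_delegate_leave instruction itself
// (checked above), and iseq_encoded[1] is its first operand: the pointer to
// the rb_builtin_function being delegated to.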
3532 return (const struct rb_builtin_function *)iseq->body->iseq_encoded[1];
3535 static codegen_status_t
3536 gen_send_iseq(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, int32_t argc)
3538 const rb_iseq_t *iseq = def_iseq_ptr(cme->def);
3540 // When you have keyword arguments, there is an extra object that gets
3541 // placed on the stack that represents a bitmap of the keywords that were not
3542 // specified at the call site. We need to keep track of the fact that this
3543 // value is present on the stack in order to properly set up the callee's
3544 // stack pointer.
3545 const bool doing_kw_call = iseq->body->param.flags.has_kw;
3546 const bool supplying_kws = vm_ci_flag(ci) & VM_CALL_KWARG;
3548 if (vm_ci_flag(ci) & VM_CALL_TAILCALL) {
3549 // We can't handle tailcalls
3550 GEN_COUNTER_INC(cb, send_iseq_tailcall);
3551 return YJIT_CANT_COMPILE;
3554 // No support for callees with these parameters yet as they require allocation
3555 // or complex handling.
3556 if (iseq->body->param.flags.has_rest ||
3557 iseq->body->param.flags.has_post ||
3558 iseq->body->param.flags.has_kwrest) {
3559 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3560 return YJIT_CANT_COMPILE;
3563 // If we have keyword arguments being passed to a callee that only takes
3564 // positionals, then we need to allocate a hash. For now we're going to
3565 // call that too complex and bail.
3566 if (supplying_kws && !iseq->body->param.flags.has_kw) {
3567 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3568 return YJIT_CANT_COMPILE;
3571 // If we have a method accepting no kwargs (**nil), exit if we have passed
3572 // it any kwargs.
3573 if (supplying_kws && iseq->body->param.flags.accepts_no_kwarg) {
3574 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3575 return YJIT_CANT_COMPILE;
3578 // For computing the number of locals to set up for the callee
3579 int num_params = iseq->body->param.size;
3581 // Block parameter handling. This mirrors setup_parameters_complex().
3582 if (iseq->body->param.flags.has_block) {
3583 if (iseq->body->local_iseq == iseq) {
3584 // Block argument is passed through EP and not set up as a local in
3585 // the callee.
3586 num_params--;
3588 else {
3589 // In this case (param.flags.has_block && local_iseq != iseq),
3590 // the block argument is set up as a local variable and requires
3591 // materialization (allocation). Bail.
3592 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3593 return YJIT_CANT_COMPILE;
3597 uint32_t start_pc_offset = 0;
3599 const int required_num = iseq->body->param.lead_num;
3601 // This struct represents the metadata about the caller-specified
3602 // keyword arguments.
3603 const struct rb_callinfo_kwarg *kw_arg = vm_ci_kwarg(ci);
3604 const int kw_arg_num = kw_arg ? kw_arg->keyword_len : 0;
3606 // Arity handling and optional parameter setup
3607 const int opts_filled = argc - required_num - kw_arg_num;
3608 const int opt_num = iseq->body->param.opt_num;
3609 const int opts_missing = opt_num - opts_filled;
3611 if (opts_filled < 0 || opts_filled > opt_num) {
3612 GEN_COUNTER_INC(cb, send_iseq_arity_error);
3613 return YJIT_CANT_COMPILE;
3616 // If we have unfilled optional arguments and keyword arguments then we
3617 // would need to adjust the argument locations to account for that.
3618 // For now we aren't handling this case.
3619 if (doing_kw_call && opts_missing > 0) {
3620 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3621 return YJIT_CANT_COMPILE;
3624 if (opt_num > 0) {
3625 num_params -= opt_num - opts_filled;
3626 start_pc_offset = (uint32_t)iseq->body->param.opt_table[opts_filled];
3629 if (doing_kw_call) {
3630 // Here we're calling a method with keyword arguments and specifying
3631 // keyword arguments at this call site.
3633 // This struct represents the metadata about the callee-specified
3634 // keyword parameters.
3635 const struct rb_iseq_param_keyword *keyword = iseq->body->param.keyword;
3637 int required_kwargs_filled = 0;
3639 if (keyword->num > 30) {
3640 // We have so many keywords that (1 << num) encoded as a FIXNUM
3641 // (which shifts it left one more) no longer fits inside a 32-bit
3642 // immediate.
3643 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3644 return YJIT_CANT_COMPILE;
3647 // Check that the kwargs being passed are valid
3648 if (supplying_kws) {
3649 // This is the list of keyword arguments that the callee specified
3650 // in its initial declaration.
3651 const ID *callee_kwargs = keyword->table;
3653 // Here we're going to build up a list of the IDs that correspond to
3654 // the caller-specified keyword arguments. If they're not in the
3655 // same order as the order specified in the callee declaration, then
3656 // we're going to need to generate some code to swap values around
3657 // on the stack.
3658 ID *caller_kwargs = ALLOCA_N(VALUE, kw_arg->keyword_len);
3659 for (int kwarg_idx = 0; kwarg_idx < kw_arg->keyword_len; kwarg_idx++)
3660 caller_kwargs[kwarg_idx] = SYM2ID(kw_arg->keywords[kwarg_idx]);
3662 // First, we're going to be sure that the names of every
3663 // caller-specified keyword argument correspond to a name in the
3664 // list of callee-specified keyword parameters.
3665 for (int caller_idx = 0; caller_idx < kw_arg->keyword_len; caller_idx++) {
3666 int callee_idx;
3668 for (callee_idx = 0; callee_idx < keyword->num; callee_idx++) {
3669 if (caller_kwargs[caller_idx] == callee_kwargs[callee_idx]) {
3670 break;
3674 // If the keyword was never found, then we know we have a
3675 // mismatch in the names of the keyword arguments, so we need to
3676 // bail.
3677 if (callee_idx == keyword->num) {
3678 GEN_COUNTER_INC(cb, send_iseq_kwargs_mismatch);
3679 return YJIT_CANT_COMPILE;
3682 // Keep a count to ensure all required kwargs are specified
3683 if (callee_idx < keyword->required_num) {
3684 required_kwargs_filled++;
3689 RUBY_ASSERT(required_kwargs_filled <= keyword->required_num);
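// Example (illustrative): for `def m(a:, b: 0); end` called as `m(b: 1)`, the
// required keyword a: is never supplied, so required_kwargs_filled stays 0 and
// we bail below, leaving the interpreter to raise the ArgumentError.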
3690 if (required_kwargs_filled != keyword->required_num) {
3691 GEN_COUNTER_INC(cb, send_iseq_kwargs_mismatch);
3692 return YJIT_CANT_COMPILE;
3696 // Number of locals that are not parameters
3697 const int num_locals = iseq->body->local_table_size - num_params;
3699 // Create a side-exit to fall back to the interpreter
3700 uint8_t *side_exit = yjit_side_exit(jit, ctx);
3702 // Check for interrupts
3703 yjit_check_ints(cb, side_exit);
3705 const struct rb_builtin_function *leaf_builtin = rb_leaf_builtin_function(iseq);
3707 if (leaf_builtin && !block && leaf_builtin->argc + 1 <= NUM_C_ARG_REGS) {
3708 ADD_COMMENT(cb, "inlined leaf builtin");
3710 // Call the builtin func (ec, recv, arg1, arg2, ...)
3711 mov(cb, C_ARG_REGS[0], REG_EC);
3713 // Copy self and arguments
3714 for (int32_t i = 0; i < leaf_builtin->argc + 1; i++) {
3715 x86opnd_t stack_opnd = ctx_stack_opnd(ctx, leaf_builtin->argc - i);
3716 x86opnd_t c_arg_reg = C_ARG_REGS[i + 1];
3717 mov(cb, c_arg_reg, stack_opnd);
3719 ctx_stack_pop(ctx, leaf_builtin->argc + 1);
3720 call_ptr(cb, REG0, (void *)leaf_builtin->func_ptr);
3722 // Push the return value
3723 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
3724 mov(cb, stack_ret, RAX);
3726 // Note: assuming that the leaf builtin doesn't change local variables here.
3727 // Seems like a safe assumption.
3729 return YJIT_KEEP_COMPILING;
3732 // Stack overflow check
3733 // Note that vm_push_frame checks it against a decremented cfp, hence the multiply by 2.
3734 // #define CHECK_VM_STACK_OVERFLOW0(cfp, sp, margin)
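// Below, REG0 = sp + (callee locals + callee stack max) + two control frames
// of margin. The value stack and the control frame stack grow toward each
// other in the same allocation, so we side exit when REG_CFP <= REG0, i.e.
// when pushing this frame could run into the control frame region.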
3735 ADD_COMMENT(cb, "stack overflow check");
3736 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * (num_locals + iseq->body->stack_max) + 2 * sizeof(rb_control_frame_t)));
3737 cmp(cb, REG_CFP, REG0);
3738 jle_ptr(cb, COUNTED_EXIT(jit, side_exit, send_se_cf_overflow));
3740 if (doing_kw_call) {
3741 // Here we're calling a method with keyword arguments and specifying
3742 // keyword arguments at this call site.
3744 // Number of positional arguments the callee expects before the first
3745 // keyword argument
3746 const int args_before_kw = required_num + opt_num;
3748 // This struct represents the metadata about the caller-specified
3749 // keyword arguments.
3750 int caller_keyword_len = 0;
3751 const VALUE *caller_keywords = NULL;
3752 if (vm_ci_kwarg(ci)) {
3753 caller_keyword_len = vm_ci_kwarg(ci)->keyword_len;
3754 caller_keywords = &vm_ci_kwarg(ci)->keywords[0];
3757 // This struct represents the metadata about the callee-specified
3758 // keyword parameters.
3759 const struct rb_iseq_param_keyword *const keyword = iseq->body->param.keyword;
3761 ADD_COMMENT(cb, "keyword args");
3763 // This is the list of keyword arguments that the callee specified
3764 // in its initial declaration.
3765 const ID *callee_kwargs = keyword->table;
3767 int total_kwargs = keyword->num;
3769 // Here we're going to build up a list of the IDs that correspond to
3770 // the caller-specified keyword arguments. If they're not in the
3771 // same order as the order specified in the callee declaration, then
3772 // we're going to need to generate some code to swap values around
3773 // on the stack.
3774 ID *caller_kwargs = ALLOCA_N(VALUE, total_kwargs);
3775 int kwarg_idx;
3776 for (kwarg_idx = 0; kwarg_idx < caller_keyword_len; kwarg_idx++) {
3777 caller_kwargs[kwarg_idx] = SYM2ID(caller_keywords[kwarg_idx]);
3780 int unspecified_bits = 0;
3782 for (int callee_idx = keyword->required_num; callee_idx < total_kwargs; callee_idx++) {
3783 bool already_passed = false;
3784 ID callee_kwarg = callee_kwargs[callee_idx];
3786 for (int caller_idx = 0; caller_idx < caller_keyword_len; caller_idx++) {
3787 if (caller_kwargs[caller_idx] == callee_kwarg) {
3788 already_passed = true;
3789 break;
3793 if (!already_passed) {
3794 // Reserve space on the stack for each default value we'll be
3795 // filling in (which is done in the next loop). Also increments
3796 // argc so that the callee's SP is recorded correctly.
3797 argc++;
3798 x86opnd_t default_arg = ctx_stack_push(ctx, TYPE_UNKNOWN);
3799 VALUE default_value = keyword->default_values[callee_idx - keyword->required_num];
3801 if (default_value == Qundef) {
3802 // Qundef means that this value is not constant and must be
3803 // recalculated at runtime, so we record it in unspecified_bits
3804 // (Qnil is then used as a placeholder instead of Qundef).
3805 unspecified_bits |= 0x01 << (callee_idx - keyword->required_num);
3806 default_value = Qnil;
3809 // GC might move default_value.
3810 jit_mov_gc_ptr(jit, cb, REG0, default_value);
3811 mov(cb, default_arg, REG0);
3813 caller_kwargs[kwarg_idx++] = callee_kwarg;
3816 RUBY_ASSERT(kwarg_idx == total_kwargs);
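// Example (illustrative): for `def m(x: 1, y: rand); end` called as `m(x: 2)`,
// y is unspecified and its default is non-constant (Qundef in default_values),
// so its bit is set in unspecified_bits and Qnil is pushed as a placeholder;
// the callee's checkkeyword instruction later triggers evaluation of the real
// default.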
3818 // Next, we're going to loop through every keyword that was
3819 // specified by the caller and make sure that it's in the correct
3820 // place. If it's not we're going to swap it around with another one.
3821 for (kwarg_idx = 0; kwarg_idx < total_kwargs; kwarg_idx++) {
3822 ID callee_kwarg = callee_kwargs[kwarg_idx];
3824 // If the argument is already in the right order, then we don't
3825 // need to generate any code since the expected value is already
3826 // in the right place on the stack.
3827 if (callee_kwarg == caller_kwargs[kwarg_idx]) continue;
3829 // In this case the argument is not in the right place, so we
3830 // need to find its position where it _should_ be and swap with
3831 // that location.
3832 for (int swap_idx = kwarg_idx + 1; swap_idx < total_kwargs; swap_idx++) {
3833 if (callee_kwarg == caller_kwargs[swap_idx]) {
3834 // First we're going to generate the code that is going
3835 // to perform the actual swapping at runtime.
3836 stack_swap(ctx, cb, argc - 1 - swap_idx - args_before_kw, argc - 1 - kwarg_idx - args_before_kw, REG1, REG0);
3838 // Next we're going to do some bookkeeping on our end so
3839 // that we know the order that the arguments are
3840 // actually in now.
3841 ID tmp = caller_kwargs[kwarg_idx];
3842 caller_kwargs[kwarg_idx] = caller_kwargs[swap_idx];
3843 caller_kwargs[swap_idx] = tmp;
3845 break;
3850 // Keyword arguments cause a special extra local variable to be
3851 // pushed onto the stack that represents the parameters that weren't
3852 // explicitly given a value and have a non-constant default.
3853 mov(cb, ctx_stack_opnd(ctx, -1), imm_opnd(INT2FIX(unspecified_bits)));
3855 // Points to the receiver operand on the stack
3856 x86opnd_t recv = ctx_stack_opnd(ctx, argc);
3858 // Store the updated SP on the current frame (pop arguments and receiver)
3859 ADD_COMMENT(cb, "store caller sp");
3860 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * -(argc + 1)));
3861 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, sp), REG0);
3863 // Store the next PC in the current frame
3864 jit_save_pc(jit, REG0);
3866 if (block) {
3867 // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block().
3868 // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self, and rb_captured_block->code.iseq aliases
3869 // with cfp->block_code.
3870 jit_mov_gc_ptr(jit, cb, REG0, (VALUE)block);
3871 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, block_code), REG0);
3874 // Adjust the callee's stack pointer
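// REG0 becomes the callee's SP: the current stack top plus room for the
// keyword bitmap (when doing_kw_call), the callee's non-parameter locals
// (initialized to Qnil below), and the 3-slot frame environment header.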
3875 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * (3 + num_locals + doing_kw_call)));
3877 // Initialize local variables to Qnil
3878 for (int i = 0; i < num_locals; i++) {
3879 mov(cb, mem_opnd(64, REG0, sizeof(VALUE) * (i - num_locals - 3)), imm_opnd(Qnil));
3882 ADD_COMMENT(cb, "push env");
3883 // Put compile time cme into REG1. It's assumed to be valid because we are notified when
3884 // any cme we depend on becomes outdated. See rb_yjit_method_lookup_change().
3885 jit_mov_gc_ptr(jit, cb, REG1, (VALUE)cme);
3886 // Write method entry at sp[-3]
3887 // sp[-3] = me;
3888 mov(cb, mem_opnd(64, REG0, 8 * -3), REG1);
3890 // Write block handler at sp[-2]
3891 // sp[-2] = block_handler;
3892 if (block) {
3893 // reg1 = VM_BH_FROM_ISEQ_BLOCK(VM_CFP_TO_CAPTURED_BLOCK(reg_cfp));
3894 lea(cb, REG1, member_opnd(REG_CFP, rb_control_frame_t, self));
3895 or(cb, REG1, imm_opnd(1));
3896 mov(cb, mem_opnd(64, REG0, 8 * -2), REG1);
3898 else {
3899 mov(cb, mem_opnd(64, REG0, 8 * -2), imm_opnd(VM_BLOCK_HANDLER_NONE));
3902 // Write env flags at sp[-1]
3903 // sp[-1] = frame_type;
3904 uint64_t frame_type = VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL;
3905 mov(cb, mem_opnd(64, REG0, 8 * -1), imm_opnd(frame_type));
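// Together sp[-3..-1] form the environment header; once ep is set to sp - 1
// below, these are ep[VM_ENV_DATA_INDEX_ME_CREF], ep[VM_ENV_DATA_INDEX_SPECVAL]
// and ep[VM_ENV_DATA_INDEX_FLAGS] respectively.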
3907 ADD_COMMENT(cb, "push callee CFP");
3908 // Allocate a new CFP (ec->cfp--)
3909 sub(cb, REG_CFP, imm_opnd(sizeof(rb_control_frame_t)));
3910 mov(cb, member_opnd(REG_EC, rb_execution_context_t, cfp), REG_CFP);
3912 // Setup the new frame
3913 // *cfp = (const struct rb_control_frame_struct) {
3914 // .pc = pc,
3915 // .sp = sp,
3916 // .iseq = iseq,
3917 // .self = recv,
3918 // .ep = sp - 1,
3919 // .block_code = 0,
3920 // .__bp__ = sp,
3921 // };
3922 mov(cb, REG1, recv);
3923 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, self), REG1);
3924 mov(cb, REG_SP, REG0); // Switch to the callee's REG_SP
3925 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, sp), REG0);
3926 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, __bp__), REG0);
3927 sub(cb, REG0, imm_opnd(sizeof(VALUE)));
3928 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, ep), REG0);
3929 jit_mov_gc_ptr(jit, cb, REG0, (VALUE)iseq);
3930 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, iseq), REG0);
3931 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, block_code), imm_opnd(0));
3933 // No need to set cfp->pc since the callee sets it whenever calling into routines
3934 // that could look at it through jit_save_pc().
3935 // mov(cb, REG0, const_ptr_opnd(start_pc));
3936 // mov(cb, member_opnd(REG_CFP, rb_control_frame_t, pc), REG0);
3938 // Stub so we can return to JITted code
3939 blockid_t return_block = { jit->iseq, jit_next_insn_idx(jit) };
3941 // Create a context for the callee
3942 ctx_t callee_ctx = DEFAULT_CTX;
3944 // Set the argument types in the callee's context
3945 for (int32_t arg_idx = 0; arg_idx < argc; ++arg_idx) {
3946 val_type_t arg_type = ctx_get_opnd_type(ctx, OPND_STACK(argc - arg_idx - 1));
3947 ctx_set_local_type(&callee_ctx, arg_idx, arg_type);
3949 val_type_t recv_type = ctx_get_opnd_type(ctx, OPND_STACK(argc));
3950 ctx_upgrade_opnd_type(&callee_ctx, OPND_SELF, recv_type);
3952 // The callee might change locals through Kernel#binding and other means.
3953 ctx_clear_local_types(ctx);
3955 // Pop arguments and receiver in return context, push the return value
3956 // After the return, sp_offset will be 1. The codegen for leave writes
3957 // the return value in case of JIT-to-JIT return.
3958 ctx_t return_ctx = *ctx;
3959 ctx_stack_pop(&return_ctx, argc + 1);
3960 ctx_stack_push(&return_ctx, TYPE_UNKNOWN);
3961 return_ctx.sp_offset = 1;
3962 return_ctx.chain_depth = 0;
3964 // Write the JIT return address on the callee frame
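// gen_return_branch() (above) stores the address of this return_block stub in
// cfp->jit_return on the callee frame; gen_leave() later jumps there so a
// JIT-to-JIT return continues directly in the caller's compiled code.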
3965 gen_branch(
3966 jit,
3967 ctx,
3968 return_block,
3969 &return_ctx,
3970 return_block,
3971 &return_ctx,
3972 gen_return_branch
3975 //print_str(cb, "calling Ruby func:");
3976 //print_str(cb, rb_id2name(vm_ci_mid(ci)));
3978 // Directly jump to the entry point of the callee
3979 gen_direct_jump(
3980 jit,
3981 &callee_ctx,
3982 (blockid_t){ iseq, start_pc_offset }
3985 return YJIT_END_BLOCK;
3988 static codegen_status_t
3989 gen_struct_aref(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, VALUE comptime_recv, VALUE comptime_recv_klass) {
3990 if (vm_ci_argc(ci) != 0) {
3991 return YJIT_CANT_COMPILE;
3994 const unsigned int off = cme->def->body.optimized.index;
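// Example (illustrative): for `S = Struct.new(:a, :b)`, S#b is an optimized
// STRUCT_AREF method with optimized.index == 1, so this compiles down to a
// direct read of the second struct slot with no method call.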
3996 // Confidence checks
3997 RUBY_ASSERT_ALWAYS(RB_TYPE_P(comptime_recv, T_STRUCT));
3998 RUBY_ASSERT_ALWAYS((long)off < RSTRUCT_LEN(comptime_recv));
4000 // We are going to use an encoding that takes a 4-byte immediate which
4001 // limits the offset to INT32_MAX.
4003 uint64_t native_off = (uint64_t)off * (uint64_t)SIZEOF_VALUE;
4004 if (native_off > (uint64_t)INT32_MAX) {
4005 return YJIT_CANT_COMPILE;
4009 // All structs from the same Struct class should have the same
4010 // length. So if our comptime_recv is embedded all runtime
4011 // structs of the same class should be as well, and the same is
4012 // true of the converse.
4013 bool embedded = FL_TEST_RAW(comptime_recv, RSTRUCT_EMBED_LEN_MASK);
4015 ADD_COMMENT(cb, "struct aref");
4017 x86opnd_t recv = ctx_stack_pop(ctx, 1);
4019 mov(cb, REG0, recv);
4021 if (embedded) {
4022 mov(cb, REG0, member_opnd_idx(REG0, struct RStruct, as.ary, off));
4024 else {
4025 mov(cb, REG0, member_opnd(REG0, struct RStruct, as.heap.ptr));
4026 mov(cb, REG0, mem_opnd(64, REG0, SIZEOF_VALUE * off));
4029 x86opnd_t ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4030 mov(cb, ret, REG0);
4032 jit_jump_to_next_insn(jit, ctx);
4033 return YJIT_END_BLOCK;
4036 static codegen_status_t
4037 gen_struct_aset(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, VALUE comptime_recv, VALUE comptime_recv_klass) {
4038 if (vm_ci_argc(ci) != 1) {
4039 return YJIT_CANT_COMPILE;
4042 const unsigned int off = cme->def->body.optimized.index;
4044 // Confidence checks
4045 RUBY_ASSERT_ALWAYS(RB_TYPE_P(comptime_recv, T_STRUCT));
4046 RUBY_ASSERT_ALWAYS((long)off < RSTRUCT_LEN(comptime_recv));
4048 ADD_COMMENT(cb, "struct aset");
4050 x86opnd_t val = ctx_stack_pop(ctx, 1);
4051 x86opnd_t recv = ctx_stack_pop(ctx, 1);
4053 mov(cb, C_ARG_REGS[0], recv);
4054 mov(cb, C_ARG_REGS[1], imm_opnd(off));
4055 mov(cb, C_ARG_REGS[2], val);
4056 call_ptr(cb, REG0, (void *)RSTRUCT_SET);
4058 x86opnd_t ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4059 mov(cb, ret, RAX);
4061 jit_jump_to_next_insn(jit, ctx);
4062 return YJIT_END_BLOCK;
4065 const rb_callable_method_entry_t *
4066 rb_aliased_callable_method_entry(const rb_callable_method_entry_t *me);
4068 static codegen_status_t
4069 gen_send_general(jitstate_t *jit, ctx_t *ctx, struct rb_call_data *cd, rb_iseq_t *block)
4071 // Relevant definitions:
4072 // rb_execution_context_t : vm_core.h
4073 // invoker, cfunc logic : method.h, vm_method.c
4074 // rb_callinfo : vm_callinfo.h
4075 // rb_callable_method_entry_t : method.h
4076 // vm_call_cfunc_with_frame : vm_insnhelper.c
4078 // For a general overview for how the interpreter calls methods,
4079 // see vm_call_method().
4081 const struct rb_callinfo *ci = cd->ci; // info about the call site
4083 int32_t argc = (int32_t)vm_ci_argc(ci);
4084 ID mid = vm_ci_mid(ci);
4086 // Don't JIT calls with keyword splat
4087 if (vm_ci_flag(ci) & VM_CALL_KW_SPLAT) {
4088 GEN_COUNTER_INC(cb, send_kw_splat);
4089 return YJIT_CANT_COMPILE;
4092 // Don't JIT calls that aren't simple
4093 // Note, not using VM_CALL_ARGS_SIMPLE because sometimes we pass a block.
4094 if ((vm_ci_flag(ci) & VM_CALL_ARGS_SPLAT) != 0) {
4095 GEN_COUNTER_INC(cb, send_args_splat);
4096 return YJIT_CANT_COMPILE;
4098 if ((vm_ci_flag(ci) & VM_CALL_ARGS_BLOCKARG) != 0) {
4099 GEN_COUNTER_INC(cb, send_block_arg);
4100 return YJIT_CANT_COMPILE;
4103 // Defer compilation so we can specialize on class of receiver
4104 if (!jit_at_current_insn(jit)) {
4105 defer_compilation(jit, ctx);
4106 return YJIT_END_BLOCK;
4109 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, argc);
4110 VALUE comptime_recv_klass = CLASS_OF(comptime_recv);
4112 // Guard that the receiver has the same class as the one from compile time
4113 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4115 // Points to the receiver operand on the stack
4116 x86opnd_t recv = ctx_stack_opnd(ctx, argc);
4117 insn_opnd_t recv_opnd = OPND_STACK(argc);
4118 mov(cb, REG0, recv);
4119 if (!jit_guard_known_klass(jit, ctx, comptime_recv_klass, recv_opnd, comptime_recv, SEND_MAX_DEPTH, side_exit)) {
4120 return YJIT_CANT_COMPILE;
4123 // Do method lookup
4124 const rb_callable_method_entry_t *cme = rb_callable_method_entry(comptime_recv_klass, mid);
4125 if (!cme) {
4126 // TODO: counter
4127 return YJIT_CANT_COMPILE;
4130 switch (METHOD_ENTRY_VISI(cme)) {
4131 case METHOD_VISI_PUBLIC:
4132 // Can always call public methods
4133 break;
4134 case METHOD_VISI_PRIVATE:
4135 if (!(vm_ci_flag(ci) & VM_CALL_FCALL)) {
4136 // Can only call private methods with FCALL callsites.
4137 // (at the moment they are callsites without a receiver or an explicit `self` receiver)
4138 return YJIT_CANT_COMPILE;
4140 break;
4141 case METHOD_VISI_PROTECTED:
4142 jit_protected_callee_ancestry_guard(jit, cb, cme, side_exit);
4143 break;
4144 case METHOD_VISI_UNDEF:
4145 RUBY_ASSERT(false && "cmes should always have a visibility");
4146 break;
4149 // Register block for invalidation
4150 RUBY_ASSERT(cme->called_id == mid);
4151 assume_method_lookup_stable(comptime_recv_klass, cme, jit);
4153 // To handle the aliased method case (VM_METHOD_TYPE_ALIAS)
4154 while (true) {
4155 // switch on the method type
4156 switch (cme->def->type) {
4157 case VM_METHOD_TYPE_ISEQ:
4158 return gen_send_iseq(jit, ctx, ci, cme, block, argc);
4159 case VM_METHOD_TYPE_CFUNC:
4160 return gen_send_cfunc(jit, ctx, ci, cme, block, argc, &comptime_recv_klass);
4161 case VM_METHOD_TYPE_IVAR:
4162 if (argc != 0) {
4163 // Argument count mismatch. Getters take no arguments.
4164 GEN_COUNTER_INC(cb, send_getter_arity);
4165 return YJIT_CANT_COMPILE;
4167 if (c_method_tracing_currently_enabled(jit)) {
4168 // Can't generate code for firing c_call and c_return events
4169 // :attr-tracing:
4170 // Handling the C method tracing events for attr_accessor
4171 // methods is easier than regular C methods as we know the
4172 // "method" we are calling into never enables those tracing
4173 // events. Once global invalidation runs, the code for the
4174 // attr_accessor is invalidated and we exit at the closest
4175 // instruction boundary which is always outside of the body of
4176 // the attr_accessor code.
4177 GEN_COUNTER_INC(cb, send_cfunc_tracing);
4178 return YJIT_CANT_COMPILE;
4181 mov(cb, REG0, recv);
4183 ID ivar_name = cme->def->body.attr.id;
4184 return gen_get_ivar(jit, ctx, SEND_MAX_DEPTH, comptime_recv, ivar_name, recv_opnd, side_exit);
4185 case VM_METHOD_TYPE_ATTRSET:
4186 if ((vm_ci_flag(ci) & VM_CALL_KWARG) != 0) {
4187 GEN_COUNTER_INC(cb, send_attrset_kwargs);
4188 return YJIT_CANT_COMPILE;
4190 else if (argc != 1 || !RB_TYPE_P(comptime_recv, T_OBJECT)) {
4191 GEN_COUNTER_INC(cb, send_ivar_set_method);
4192 return YJIT_CANT_COMPILE;
4194 else if (c_method_tracing_currently_enabled(jit)) {
4195 // Can't generate code for firing c_call and c_return events
4196 // See :attr-tracing:
4197 GEN_COUNTER_INC(cb, send_cfunc_tracing);
4198 return YJIT_CANT_COMPILE;
4200 else {
4201 ID ivar_name = cme->def->body.attr.id;
4202 return gen_set_ivar(jit, ctx, comptime_recv, comptime_recv_klass, ivar_name);
4204 // Block method, e.g. define_method(:foo) { :my_block }
4205 case VM_METHOD_TYPE_BMETHOD:
4206 GEN_COUNTER_INC(cb, send_bmethod);
4207 return YJIT_CANT_COMPILE;
4208 case VM_METHOD_TYPE_ZSUPER:
4209 GEN_COUNTER_INC(cb, send_zsuper_method);
4210 return YJIT_CANT_COMPILE;
4211 case VM_METHOD_TYPE_ALIAS: {
4212 // Retrieve the aliased method and re-enter the switch
4213 cme = rb_aliased_callable_method_entry(cme);
4214 continue;
4216 case VM_METHOD_TYPE_UNDEF:
4217 GEN_COUNTER_INC(cb, send_undef_method);
4218 return YJIT_CANT_COMPILE;
4219 case VM_METHOD_TYPE_NOTIMPLEMENTED:
4220 GEN_COUNTER_INC(cb, send_not_implemented_method);
4221 return YJIT_CANT_COMPILE;
4222 // Send family of methods, e.g. call/apply
4223 case VM_METHOD_TYPE_OPTIMIZED:
4224 switch (cme->def->body.optimized.type) {
4225 case OPTIMIZED_METHOD_TYPE_SEND:
4226 GEN_COUNTER_INC(cb, send_optimized_method_send);
4227 return YJIT_CANT_COMPILE;
4228 case OPTIMIZED_METHOD_TYPE_CALL:
4229 GEN_COUNTER_INC(cb, send_optimized_method_call);
4230 return YJIT_CANT_COMPILE;
4231 case OPTIMIZED_METHOD_TYPE_BLOCK_CALL:
4232 GEN_COUNTER_INC(cb, send_optimized_method_block_call);
4233 return YJIT_CANT_COMPILE;
4234 case OPTIMIZED_METHOD_TYPE_STRUCT_AREF:
4235 return gen_struct_aref(jit, ctx, ci, cme, comptime_recv, comptime_recv_klass);
4236 case OPTIMIZED_METHOD_TYPE_STRUCT_ASET:
4237 return gen_struct_aset(jit, ctx, ci, cme, comptime_recv, comptime_recv_klass);
4238 default:
4239 rb_bug("unknown optimized method type (%d)", cme->def->body.optimized.type);
4240 UNREACHABLE_RETURN(YJIT_CANT_COMPILE);
4242 case VM_METHOD_TYPE_MISSING:
4243 GEN_COUNTER_INC(cb, send_missing_method);
4244 return YJIT_CANT_COMPILE;
4245 case VM_METHOD_TYPE_REFINED:
4246 GEN_COUNTER_INC(cb, send_refined_method);
4247 return YJIT_CANT_COMPILE;
4248 // no default case so compiler issues a warning if this is not exhaustive
4251 // Unreachable
4252 RUBY_ASSERT(false);
4256 static codegen_status_t
4257 gen_opt_send_without_block(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4259 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 0);
4260 return gen_send_general(jit, ctx, cd, NULL);
4263 static codegen_status_t
4264 gen_send(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4266 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 0);
4267 rb_iseq_t *block = (rb_iseq_t *)jit_get_arg(jit, 1);
4268 return gen_send_general(jit, ctx, cd, block);
4271 static codegen_status_t
4272 gen_invokesuper(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4274 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 0);
4275 rb_iseq_t *block = (rb_iseq_t *)jit_get_arg(jit, 1);
4277 // Defer compilation so we can specialize on class of receiver
4278 if (!jit_at_current_insn(jit)) {
4279 defer_compilation(jit, ctx);
4280 return YJIT_END_BLOCK;
4283 const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(jit->ec->cfp);
4284 if (!me) {
4285 return YJIT_CANT_COMPILE;
4288 // FIXME: We should track and invalidate this block when this cme is invalidated
4289 VALUE current_defined_class = me->defined_class;
4290 ID mid = me->def->original_id;
4292 if (me != rb_callable_method_entry(current_defined_class, me->called_id)) {
4293 // Though we likely could generate this call (we are only concerned
4294 // with the method entry remaining valid), assume_method_lookup_stable()
4295 // below requires that the method lookup matches as well.
4296 return YJIT_CANT_COMPILE;
4299 // vm_search_normal_superclass
4300 if (BUILTIN_TYPE(current_defined_class) == T_ICLASS && FL_TEST_RAW(RBASIC(current_defined_class)->klass, RMODULE_IS_REFINEMENT)) {
4301 return YJIT_CANT_COMPILE;
4303 VALUE comptime_superclass = RCLASS_SUPER(RCLASS_ORIGIN(current_defined_class));
4305 const struct rb_callinfo *ci = cd->ci;
4306 int32_t argc = (int32_t)vm_ci_argc(ci);
4308 // Don't JIT calls that aren't simple
4309 // Note, not using VM_CALL_ARGS_SIMPLE because sometimes we pass a block.
4310 if ((vm_ci_flag(ci) & VM_CALL_ARGS_SPLAT) != 0) {
4311 GEN_COUNTER_INC(cb, send_args_splat);
4312 return YJIT_CANT_COMPILE;
4314 if ((vm_ci_flag(ci) & VM_CALL_KWARG) != 0) {
4315 GEN_COUNTER_INC(cb, send_keywords);
4316 return YJIT_CANT_COMPILE;
4318 if ((vm_ci_flag(ci) & VM_CALL_KW_SPLAT) != 0) {
4319 GEN_COUNTER_INC(cb, send_kw_splat);
4320 return YJIT_CANT_COMPILE;
4322 if ((vm_ci_flag(ci) & VM_CALL_ARGS_BLOCKARG) != 0) {
4323 GEN_COUNTER_INC(cb, send_block_arg);
4324 return YJIT_CANT_COMPILE;
4327 // Ensure we haven't rebound this method onto an incompatible class.
4328 // In the interpreter we try to avoid making this check by performing some
4329 // cheaper calculations first, but since we specialize on the method entry
4330 // and so only have to do this once at compile time, it is fine to always
4331 // check and side exit.
4332 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, argc);
4333 if (!rb_obj_is_kind_of(comptime_recv, current_defined_class)) {
4334 return YJIT_CANT_COMPILE;
4337 // Do method lookup
4338 const rb_callable_method_entry_t *cme = rb_callable_method_entry(comptime_superclass, mid);
4340 if (!cme) {
4341 return YJIT_CANT_COMPILE;
4344 // Check that we'll be able to write this method dispatch before generating checks
4345 switch (cme->def->type) {
4346 case VM_METHOD_TYPE_ISEQ:
4347 case VM_METHOD_TYPE_CFUNC:
4348 break;
4349 default:
4350 // others unimplemented
4351 return YJIT_CANT_COMPILE;
4354 // Guard that the receiver has the same class as the one from compile time
4355 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4357 if (jit->ec->cfp->ep[VM_ENV_DATA_INDEX_ME_CREF] != (VALUE)me) {
4358 // This will be the case for super within a block
4359 return YJIT_CANT_COMPILE;
4362 ADD_COMMENT(cb, "guard known me");
4363 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, ep));
4364 x86opnd_t ep_me_opnd = mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_ME_CREF);
4365 jit_mov_gc_ptr(jit, cb, REG1, (VALUE)me);
4366 cmp(cb, ep_me_opnd, REG1);
4367 jne_ptr(cb, COUNTED_EXIT(jit, side_exit, invokesuper_me_changed));
4369 if (!block) {
4370 // Guard no block passed
4371 // rb_vm_frame_block_handler(GET_EC()->cfp) == VM_BLOCK_HANDLER_NONE
4372 // note, we assume VM_ASSERT(VM_ENV_LOCAL_P(ep))
4374 // TODO: this could properly forward the current block handler, but
4375 // would require changes to gen_send_*
4376 ADD_COMMENT(cb, "guard no block given");
4377 // EP is in REG0 from above
4378 x86opnd_t ep_specval_opnd = mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_SPECVAL);
4379 cmp(cb, ep_specval_opnd, imm_opnd(VM_BLOCK_HANDLER_NONE));
4380 jne_ptr(cb, COUNTED_EXIT(jit, side_exit, invokesuper_block));
4383 // Points to the receiver operand on the stack
4384 x86opnd_t recv = ctx_stack_opnd(ctx, argc);
4385 mov(cb, REG0, recv);
4387 // We need to assume that both our current method entry and the super
4388 // method entry we invoke remain stable
4389 assume_method_lookup_stable(current_defined_class, me, jit);
4390 assume_method_lookup_stable(comptime_superclass, cme, jit);
4392 // Method calls may corrupt types
4393 ctx_clear_local_types(ctx);
4395 switch (cme->def->type) {
4396 case VM_METHOD_TYPE_ISEQ:
4397 return gen_send_iseq(jit, ctx, ci, cme, block, argc);
4398 case VM_METHOD_TYPE_CFUNC:
4399 return gen_send_cfunc(jit, ctx, ci, cme, block, argc, NULL);
4400 default:
4401 break;
4404 RUBY_ASSERT_ALWAYS(false);
4407 static codegen_status_t
4408 gen_leave(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4410 // Only the return value should be on the stack
4411 RUBY_ASSERT(ctx->stack_size == 1);
4413 // Create a side-exit to fall back to the interpreter
4414 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4416 // Load environment pointer EP from CFP
4417 mov(cb, REG1, member_opnd(REG_CFP, rb_control_frame_t, ep));
4419 // Check for interrupts
4420 ADD_COMMENT(cb, "check for interrupts");
4421 yjit_check_ints(cb, COUNTED_EXIT(jit, side_exit, leave_se_interrupt));
4423 // Load the return value
4424 mov(cb, REG0, ctx_stack_pop(ctx, 1));
4426 // Pop the current frame (ec->cfp++)
4427 // Note: the return PC is already in the previous CFP
4428 add(cb, REG_CFP, imm_opnd(sizeof(rb_control_frame_t)));
4429 mov(cb, member_opnd(REG_EC, rb_execution_context_t, cfp), REG_CFP);
4431 // Reload REG_SP for the caller and write the return value.
4432 // Top of the stack is REG_SP[0] since the caller has sp_offset=1.
4433 mov(cb, REG_SP, member_opnd(REG_CFP, rb_control_frame_t, sp));
4434 mov(cb, mem_opnd(64, REG_SP, 0), REG0);
4436 // Jump to the JIT return address on the frame that was just popped
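// (jit_return was written by gen_return_branch() when the caller pushed this
// frame; the return value is already at the caller's SP[0], written above.)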
4437 const int32_t offset_to_jit_return = -((int32_t)sizeof(rb_control_frame_t)) + (int32_t)offsetof(rb_control_frame_t, jit_return);
4438 jmp_rm(cb, mem_opnd(64, REG_CFP, offset_to_jit_return));
4440 return YJIT_END_BLOCK;
4443 RUBY_EXTERN rb_serial_t ruby_vm_global_constant_state;
4445 static codegen_status_t
4446 gen_getglobal(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4448 ID gid = jit_get_arg(jit, 0);
4450 // Save the PC and SP because we might make a Ruby call for warning
4451 jit_prepare_routine_call(jit, ctx, REG0);
4453 mov(cb, C_ARG_REGS[0], imm_opnd(gid));
4455 call_ptr(cb, REG0, (void *)&rb_gvar_get);
4457 x86opnd_t top = ctx_stack_push(ctx, TYPE_UNKNOWN);
4458 mov(cb, top, RAX);
4460 return YJIT_KEEP_COMPILING;
4463 static codegen_status_t
4464 gen_setglobal(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4466 ID gid = jit_get_arg(jit, 0);
4468 // Save the PC and SP because we might make a Ruby call for
4469 // Kernel#trace_var
4470 jit_prepare_routine_call(jit, ctx, REG0);
4472 mov(cb, C_ARG_REGS[0], imm_opnd(gid));
4474 x86opnd_t val = ctx_stack_pop(ctx, 1);
4476 mov(cb, C_ARG_REGS[1], val);
4478 call_ptr(cb, REG0, (void *)&rb_gvar_set);
4480 return YJIT_KEEP_COMPILING;
4483 static codegen_status_t
4484 gen_anytostring(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4486 // Might allocate in rb_obj_as_string_result().
4487 jit_prepare_routine_call(jit, ctx, REG0);
4489 x86opnd_t str = ctx_stack_pop(ctx, 1);
4490 x86opnd_t val = ctx_stack_pop(ctx, 1);
4492 mov(cb, C_ARG_REGS[0], str);
4493 mov(cb, C_ARG_REGS[1], val);
4495 call_ptr(cb, REG0, (void *)&rb_obj_as_string_result);
4497 // Push the return value
4498 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_STRING);
4499 mov(cb, stack_ret, RAX);
4501 return YJIT_KEEP_COMPILING;
4504 static codegen_status_t
4505 gen_objtostring(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4507 if (!jit_at_current_insn(jit)) {
4508 defer_compilation(jit, ctx);
4509 return YJIT_END_BLOCK;
4512 x86opnd_t recv = ctx_stack_opnd(ctx, 0);
4513 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, 0);
4515 if (RB_TYPE_P(comptime_recv, T_STRING)) {
4516 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4518 mov(cb, REG0, recv);
4519 jit_guard_known_klass(jit, ctx, CLASS_OF(comptime_recv), OPND_STACK(0), comptime_recv, SEND_MAX_DEPTH, side_exit);
4520 // No work needed. The string value is already on the top of the stack.
4521 return YJIT_KEEP_COMPILING;
4523 else {
4524 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 0);
4525 return gen_send_general(jit, ctx, cd, NULL);
4529 static codegen_status_t
4530 gen_toregexp(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4532 rb_num_t opt = jit_get_arg(jit, 0);
4533 rb_num_t cnt = jit_get_arg(jit, 1);
4535 // Save the PC and SP because this allocates an object and could
4536 // raise an exception.
4537 jit_prepare_routine_call(jit, ctx, REG0);
4539 x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(int32_t)(sizeof(VALUE) * (uint32_t)cnt));
4540 ctx_stack_pop(ctx, cnt);
4542 mov(cb, C_ARG_REGS[0], imm_opnd(0));
4543 mov(cb, C_ARG_REGS[1], imm_opnd(cnt));
4544 lea(cb, C_ARG_REGS[2], values_ptr);
4545 call_ptr(cb, REG0, (void *)&rb_ary_tmp_new_from_values);
4547 // Save the array so we can clear it later
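// (pushed twice so the native stack stays 16-byte aligned across the
// following C calls, as the SysV ABI requires)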
4548 push(cb, RAX);
4549 push(cb, RAX); // Alignment
4550 mov(cb, C_ARG_REGS[0], RAX);
4551 mov(cb, C_ARG_REGS[1], imm_opnd(opt));
4552 call_ptr(cb, REG0, (void *)&rb_reg_new_ary);
4554 // The actual regex is in RAX now. Pop the temp array from
4555 // rb_ary_tmp_new_from_values into C arg regs so we can clear it
4556 pop(cb, REG1); // Alignment
4557 pop(cb, C_ARG_REGS[0]);
4559 // The value we want to push on the stack is in RAX right now
4560 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4561 mov(cb, stack_ret, RAX);
4563 // Clear the temp array.
4564 call_ptr(cb, REG0, (void *)&rb_ary_clear);
4566 return YJIT_KEEP_COMPILING;
4569 static codegen_status_t
4570 gen_intern(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4572 // Save the PC and SP because we might allocate
4573 jit_prepare_routine_call(jit, ctx, REG0);
4575 x86opnd_t str = ctx_stack_pop(ctx, 1);
4577 mov(cb, C_ARG_REGS[0], str);
4579 call_ptr(cb, REG0, (void *)&rb_str_intern);
4581 // Push the return value
4582 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4583 mov(cb, stack_ret, RAX);
4585 return YJIT_KEEP_COMPILING;
4588 static codegen_status_t
4589 gen_getspecial(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4591 // This takes two arguments, key and type
4592 // key is only used when type == 0
4593 // A non-zero type determines which type of backref to fetch
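// The two branches below decode it as: odd type => special backref named by
// the character type >> 1 (e.g. $& or $'); even non-zero type => $N with
// N == type >> 1.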
4594 //rb_num_t key = jit_get_arg(jit, 0);
4595 rb_num_t type = jit_get_arg(jit, 1);
4597 if (type == 0) {
4598 // not yet implemented
4599 return YJIT_CANT_COMPILE;
4601 else if (type & 0x01) {
4602 // Fetch a "special" backref based on a char encoded by shifting by 1
4604 // Can raise if matchdata uninitialized
4605 jit_prepare_routine_call(jit, ctx, REG0);
4607 // call rb_backref_get()
4608 ADD_COMMENT(cb, "rb_backref_get");
4609 call_ptr(cb, REG0, (void *)rb_backref_get);
4610 mov(cb, C_ARG_REGS[0], RAX);
4612 switch (type >> 1) {
4613 case '&':
4614 ADD_COMMENT(cb, "rb_reg_last_match");
4615 call_ptr(cb, REG0, (void *)rb_reg_last_match);
4616 break;
4617 case '`':
4618 ADD_COMMENT(cb, "rb_reg_match_pre");
4619 call_ptr(cb, REG0, (void *)rb_reg_match_pre);
4620 break;
4621 case '\'':
4622 ADD_COMMENT(cb, "rb_reg_match_post");
4623 call_ptr(cb, REG0, (void *)rb_reg_match_post);
4624 break;
4625 case '+':
4626 ADD_COMMENT(cb, "rb_reg_match_last");
4627 call_ptr(cb, REG0, (void *)rb_reg_match_last);
4628 break;
4629 default:
4630 rb_bug("invalid back-ref");
4633 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4634 mov(cb, stack_ret, RAX);
4636 return YJIT_KEEP_COMPILING;
4638 else {
4639 // Fetch the N-th match from the last backref based on type shifted by 1
4641 // Can raise if matchdata uninitialized
4642 jit_prepare_routine_call(jit, ctx, REG0);
4644 // call rb_backref_get()
4645 ADD_COMMENT(cb, "rb_backref_get");
4646 call_ptr(cb, REG0, (void *)rb_backref_get);
4648 // rb_reg_nth_match((int)(type >> 1), backref);
4649 ADD_COMMENT(cb, "rb_reg_nth_match");
4650 mov(cb, C_ARG_REGS[0], imm_opnd(type >> 1));
4651 mov(cb, C_ARG_REGS[1], RAX);
4652 call_ptr(cb, REG0, (void *)rb_reg_nth_match);
4654 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4655 mov(cb, stack_ret, RAX);
4657 return YJIT_KEEP_COMPILING;
4661 VALUE
4662 rb_vm_getclassvariable(const rb_iseq_t *iseq, const rb_control_frame_t *cfp, ID id, ICVARC ic);
4664 static codegen_status_t
4665 gen_getclassvariable(jitstate_t* jit, ctx_t* ctx, codeblock_t* cb)
4667 // rb_vm_getclassvariable can raise exceptions.
4668 jit_prepare_routine_call(jit, ctx, REG0);
4670 mov(cb, C_ARG_REGS[0], member_opnd(REG_CFP, rb_control_frame_t, iseq));
4671 mov(cb, C_ARG_REGS[1], REG_CFP);
4672 mov(cb, C_ARG_REGS[2], imm_opnd(jit_get_arg(jit, 0)));
4673 mov(cb, C_ARG_REGS[3], imm_opnd(jit_get_arg(jit, 1)));
4675 call_ptr(cb, REG0, (void *)rb_vm_getclassvariable);
4677 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_UNKNOWN);
4678 mov(cb, stack_top, RAX);
4680 return YJIT_KEEP_COMPILING;
4683 VALUE
4684 rb_vm_setclassvariable(const rb_iseq_t *iseq, const rb_control_frame_t *cfp, ID id, VALUE val, ICVARC ic);
4686 static codegen_status_t
4687 gen_setclassvariable(jitstate_t* jit, ctx_t* ctx, codeblock_t* cb)
4689 // rb_vm_setclassvariable can raise exceptions.
4690 jit_prepare_routine_call(jit, ctx, REG0);
4692 mov(cb, C_ARG_REGS[0], member_opnd(REG_CFP, rb_control_frame_t, iseq));
4693 mov(cb, C_ARG_REGS[1], REG_CFP);
4694 mov(cb, C_ARG_REGS[2], imm_opnd(jit_get_arg(jit, 0)));
4695 mov(cb, C_ARG_REGS[3], ctx_stack_pop(ctx, 1));
4696 mov(cb, C_ARG_REGS[4], imm_opnd(jit_get_arg(jit, 1)));
4698 call_ptr(cb, REG0, (void *)rb_vm_setclassvariable);
4700 return YJIT_KEEP_COMPILING;
4703 static codegen_status_t
4704 gen_opt_getinlinecache(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4706 VALUE jump_offset = jit_get_arg(jit, 0);
4707 VALUE const_cache_as_value = jit_get_arg(jit, 1);
4708 IC ic = (IC)const_cache_as_value;
4710 // See vm_ic_hit_p(). The same conditions are checked in yjit_constant_ic_update().
4711 struct iseq_inline_constant_cache_entry *ice = ic->entry;
4712 if (!ice || // cache not filled
4713 GET_IC_SERIAL(ice) != ruby_vm_global_constant_state /* cache out of date */) {
4714 // In these cases, leave a block that unconditionally side exits
4715 // for the interpreter to invalidate.
4716 return YJIT_CANT_COMPILE;
4719 // Make sure there is an exit for this block as the interpreter might want
4720 // to invalidate this block from yjit_constant_ic_update().
4721 jit_ensure_block_entry_exit(jit);
4723 if (ice->ic_cref) {
4724 // Cache is keyed on a certain lexical scope. Use the interpreter's cache.
4725 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4727 // Call function to verify the cache. It doesn't allocate or call methods.
4728 bool rb_vm_ic_hit_p(IC ic, const VALUE *reg_ep);
4729 mov(cb, C_ARG_REGS[0], const_ptr_opnd((void *)ic));
4730 mov(cb, C_ARG_REGS[1], member_opnd(REG_CFP, rb_control_frame_t, ep));
4731 call_ptr(cb, REG0, (void *)rb_vm_ic_hit_p);
4733 // Check the result. _Bool is one byte in SysV.
4734 test(cb, AL, AL);
4735 jz_ptr(cb, COUNTED_EXIT(jit, side_exit, opt_getinlinecache_miss));
4737 // Push ic->entry->value
4738 mov(cb, REG0, const_ptr_opnd((void *)ic));
4739 mov(cb, REG0, member_opnd(REG0, struct iseq_inline_constant_cache, entry));
4740 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_UNKNOWN);
4741 mov(cb, REG0, member_opnd(REG0, struct iseq_inline_constant_cache_entry, value));
4742 mov(cb, stack_top, REG0);
4744 else {
4745 // Optimize for single ractor mode.
4746 // FIXME: This leaks when st_insert raises NoMemoryError
4747 if (!assume_single_ractor_mode(jit)) return YJIT_CANT_COMPILE;
4749 // Invalidate output code on any and all constant writes
4750 // FIXME: This leaks when st_insert raises NoMemoryError
4751 assume_stable_global_constant_state(jit);
4753 jit_putobject(jit, ctx, ice->value);
4756 // Jump over the code for filling the cache
4757 uint32_t jump_idx = jit_next_insn_idx(jit) + (int32_t)jump_offset;
4758 gen_direct_jump(
4759 jit,
4760 ctx,
4761 (blockid_t){ .iseq = jit->iseq, .idx = jump_idx }
4764 return YJIT_END_BLOCK;
4767 // Push the explicit block parameter onto the temporary stack. Part of the
4768 // interpreter's scheme for avoiding Proc allocations when delegating
4769 // explicit block parameters.
4770 static codegen_status_t
4771 gen_getblockparamproxy(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4773 // This mirrors the interpreter code, checking for the case
4774 // where it pushes rb_block_param_proxy.
4775 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4777 // EP level
4778 uint32_t level = (uint32_t)jit_get_arg(jit, 1);
4780 // Load environment pointer EP from CFP
4781 gen_get_ep(cb, REG0, level);
4783 // Bail when VM_ENV_FLAGS(ep, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM) is non-zero
4784 test(cb, mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_FLAGS), imm_opnd(VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM));
4785 jnz_ptr(cb, COUNTED_EXIT(jit, side_exit, gbpp_block_param_modified));
4787 // Load the block handler for the current frame
4788 // note, VM_ASSERT(VM_ENV_LOCAL_P(ep))
4789 mov(cb, REG0, mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_SPECVAL));
4791 // Block handler is a tagged pointer. Look at the tag. 0x03 is from VM_BH_ISEQ_BLOCK_P().
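// VM_BH_FROM_ISEQ_BLOCK() tags the captured block pointer with 0x1 (ifunc
// blocks use 0x3), so masking with 0x3 and comparing against 0x1 below accepts
// only iseq block handlers and rejects a null handler as well as proc and
// symbol handlers.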
4792 and(cb, REG0_8, imm_opnd(0x3));
4794 // Bail unless VM_BH_ISEQ_BLOCK_P(bh). This also checks for null.
4795 cmp(cb, REG0_8, imm_opnd(0x1));
4796 jnz_ptr(cb, COUNTED_EXIT(jit, side_exit, gbpp_block_handler_not_iseq));
4798 // Push rb_block_param_proxy. It's a root, so no need to use jit_mov_gc_ptr.
4799 mov(cb, REG0, const_ptr_opnd((void *)rb_block_param_proxy));
4800 RUBY_ASSERT(!SPECIAL_CONST_P(rb_block_param_proxy));
4801 x86opnd_t top = ctx_stack_push(ctx, TYPE_HEAP);
4802 mov(cb, top, REG0);
4804 return YJIT_KEEP_COMPILING;
4807 static codegen_status_t
4808 gen_invokebuiltin(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4810 const struct rb_builtin_function *bf = (struct rb_builtin_function *)jit_get_arg(jit, 0);
4812 // ec, self, and arguments
4813 if (bf->argc + 2 > NUM_C_ARG_REGS) {
4814 return YJIT_CANT_COMPILE;
4817 // If the calls don't allocate, do they need up to date PC, SP?
4818 jit_prepare_routine_call(jit, ctx, REG0);
4820 // Call the builtin func (ec, recv, arg1, arg2, ...)
4821 mov(cb, C_ARG_REGS[0], REG_EC);
4822 mov(cb, C_ARG_REGS[1], member_opnd(REG_CFP, rb_control_frame_t, self));
4824 // Copy arguments from locals
4825 for (int32_t i = 0; i < bf->argc; i++) {
4826 x86opnd_t stack_opnd = ctx_stack_opnd(ctx, bf->argc - i - 1);
4827 x86opnd_t c_arg_reg = C_ARG_REGS[2 + i];
4828 mov(cb, c_arg_reg, stack_opnd);
4831 call_ptr(cb, REG0, (void *)bf->func_ptr);
4833 // Push the return value
4834 ctx_stack_pop(ctx, bf->argc);
4835 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4836 mov(cb, stack_ret, RAX);
4838 return YJIT_KEEP_COMPILING;
4841 // opt_invokebuiltin_delegate calls a builtin function, like
4842 // invokebuiltin does, but instead of taking arguments from the top of the
4843 // stack it uses the argument locals (and self) from the current method.
4844 static codegen_status_t
4845 gen_opt_invokebuiltin_delegate(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4847 const struct rb_builtin_function *bf = (struct rb_builtin_function *)jit_get_arg(jit, 0);
4848 int32_t start_index = (int32_t)jit_get_arg(jit, 1);
4850 // ec, self, and arguments
4851 if (bf->argc + 2 > NUM_C_ARG_REGS) {
4852 return YJIT_CANT_COMPILE;
4855 // If the calls don't allocate, do they need up to date PC, SP?
4856 jit_prepare_routine_call(jit, ctx, REG0);
4858 if (bf->argc > 0) {
4859 // Load environment pointer EP from CFP
4860 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, ep));
4863 // Call the builtin func (ec, recv, arg1, arg2, ...)
4864 mov(cb, C_ARG_REGS[0], REG_EC);
4865 mov(cb, C_ARG_REGS[1], member_opnd(REG_CFP, rb_control_frame_t, self));
4867 // Copy arguments from locals
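// Locals live just below the environment header on the VM stack, so the
// offset computed below turns local index (start_index + i) into a negative
// displacement from EP.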
4868 for (int32_t i = 0; i < bf->argc; i++) {
4869 const int32_t offs = start_index + i - jit->iseq->body->local_table_size - VM_ENV_DATA_SIZE + 1;
4870 x86opnd_t local_opnd = mem_opnd(64, REG0, offs * SIZEOF_VALUE);
4871 x86opnd_t c_arg_reg = C_ARG_REGS[i + 2];
4872 mov(cb, c_arg_reg, local_opnd);
4874 call_ptr(cb, REG0, (void *)bf->func_ptr);
4876 // Push the return value
4877 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4878 mov(cb, stack_ret, RAX);
4880 return YJIT_KEEP_COMPILING;
4883 static int tracing_invalidate_all_i(void *vstart, void *vend, size_t stride, void *data);
4884 static void invalidate_all_blocks_for_tracing(const rb_iseq_t *iseq);
4886 // Invalidate all generated code and patch C method return code to contain
4887 // logic for firing the c_return TracePoint event. Once rb_vm_barrier()
4888 // returns, all other ractors are pausing inside RB_VM_LOCK_ENTER(), which
4889 // means they are inside a C routine. If there is any generated code on the stack,
4890 // it is waiting for a return from a C routine. For every routine call, we
4891 // patch in an exit after the body of the containing VM instruction. This makes
4892 // it so all the invalidated code exits as soon as execution logically reaches
4893 // the next VM instruction. The interpreter takes care of firing the tracing
4894 // event if it so happens that the next VM instruction has one attached.
4896 // The c_return event needs special handling as our codegen never outputs code
4897 // that contains tracing logic. If we let the normal output code run until the
4898 // start of the next VM instruction by relying on the patching scheme above, we
4899 // would fail to fire the c_return event. The interpreter doesn't fire the
4900 // event at an instruction boundary, so simply exiting to the interpreter isn't
4901 // enough. To handle it, we patch in the full logic at the return address. See
4902 // full_cfunc_return().
4904 // In addition to patching, we prevent future entries into invalidated code by
4905 // removing all live blocks from their iseq.
4906 void
4907 rb_yjit_tracing_invalidate_all(void)
4909 if (!rb_yjit_enabled_p()) return;
4911 // Stop other ractors since we are going to patch machine code.
4912 RB_VM_LOCK_ENTER();
4913 rb_vm_barrier();
4915 // Make it so all live block versions are no longer valid branch targets
4916 rb_objspace_each_objects(tracing_invalidate_all_i, NULL);
4918 // Apply patches
4919 const uint32_t old_pos = cb->write_pos;
4920 rb_darray_for(global_inval_patches, patch_idx) {
4921 struct codepage_patch patch = rb_darray_get(global_inval_patches, patch_idx);
4922 cb_set_pos(cb, patch.inline_patch_pos);
4923 uint8_t *jump_target = cb_get_ptr(ocb, patch.outlined_target_pos);
4924 jmp_ptr(cb, jump_target);
4926 cb_set_pos(cb, old_pos);
4928 // Freeze the invalidated part of the codepage. We only want to wait for
4929 // running instances of the code to exit from now on, so we shouldn't
4930 // change the code. There could be other ractors sleeping in
4931 // branch_stub_hit(), for example. We could harden this by changing memory
4932 // protection on the frozen range.
4933 RUBY_ASSERT_ALWAYS(yjit_codepage_frozen_bytes <= old_pos && "frozen bytes should increase monotonically");
4934 yjit_codepage_frozen_bytes = old_pos;
4936 cb_mark_all_executable(ocb);
4937 cb_mark_all_executable(cb);
4938 RB_VM_LOCK_LEAVE();
4941 static int
4942 tracing_invalidate_all_i(void *vstart, void *vend, size_t stride, void *data)
4944 VALUE v = (VALUE)vstart;
4945 for (; v != (VALUE)vend; v += stride) {
4946 void *ptr = asan_poisoned_object_p(v);
4947 asan_unpoison_object(v, false);
4949 if (rb_obj_is_iseq(v)) {
4950 rb_iseq_t *iseq = (rb_iseq_t *)v;
4951 invalidate_all_blocks_for_tracing(iseq);
4954 asan_poison_object_if(ptr, v);
4956 return 0;
4959 static void
4960 invalidate_all_blocks_for_tracing(const rb_iseq_t *iseq)
4962 struct rb_iseq_constant_body *body = iseq->body;
4963 if (!body) return; // iseq yet to be initialized
4965 ASSERT_vm_locking();
4967 // Empty all blocks on the iseq so we don't compile new blocks that jump to the
4968 // invalidated region.
4969 // TODO Leaking the blocks for now since we might have situations where
4970 // a different ractor is waiting in branch_stub_hit(). If we free the block,
4971 // that ractor can wake up with a dangling block.
4972 rb_darray_for(body->yjit_blocks, version_array_idx) {
4973 rb_yjit_block_array_t version_array = rb_darray_get(body->yjit_blocks, version_array_idx);
4974 rb_darray_for(version_array, version_idx) {
4975 // Stop listening for invalidation events like basic operation redefinition.
4976 block_t *block = rb_darray_get(version_array, version_idx);
4977 yjit_unlink_method_lookup_dependency(block);
4978 yjit_block_assumptions_free(block);
4980 rb_darray_free(version_array);
4982 rb_darray_free(body->yjit_blocks);
4983 body->yjit_blocks = NULL;
4985 #if USE_MJIT
4986 // Reset output code entry point
4987 body->jit_func = NULL;
4988 #endif
4991 static void
4992 yjit_reg_op(int opcode, codegen_fn gen_fn)
4994 RUBY_ASSERT(opcode >= 0 && opcode < VM_INSTRUCTION_SIZE);
4995 // Check that the op wasn't previously registered
4996 RUBY_ASSERT(gen_fns[opcode] == NULL);
4998 gen_fns[opcode] = gen_fn;
5001 void
5002 yjit_init_codegen(void)
5004 // Initialize the code blocks
5005 uint32_t mem_size = rb_yjit_opts.exec_mem_size * 1024 * 1024;
5006 uint8_t *mem_block = alloc_exec_mem(mem_size);
5008 cb = &block;
5009 cb_init(cb, mem_block, mem_size/2);
5011 ocb = &outline_block;
5012 cb_init(ocb, mem_block + mem_size/2, mem_size/2);
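// The executable region is split in half: cb holds the inline code paths and
// ocb the outlined code (side exits and other rarely-taken paths).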
5014 // Generate the interpreter exit code for leave
5015 leave_exit_code = yjit_gen_leave_exit(cb);
5017 // Generate full exit code for C func
5018 gen_full_cfunc_return();
5019 cb_mark_all_executable(cb);
5021 // Map YARV opcodes to the corresponding codegen functions
5022 yjit_reg_op(BIN(nop), gen_nop);
5023 yjit_reg_op(BIN(dup), gen_dup);
5024 yjit_reg_op(BIN(dupn), gen_dupn);
5025 yjit_reg_op(BIN(swap), gen_swap);
5026 yjit_reg_op(BIN(setn), gen_setn);
5027 yjit_reg_op(BIN(topn), gen_topn);
5028 yjit_reg_op(BIN(pop), gen_pop);
5029 yjit_reg_op(BIN(adjuststack), gen_adjuststack);
5030 yjit_reg_op(BIN(newarray), gen_newarray);
5031 yjit_reg_op(BIN(duparray), gen_duparray);
5032 yjit_reg_op(BIN(duphash), gen_duphash);
5033 yjit_reg_op(BIN(splatarray), gen_splatarray);
5034 yjit_reg_op(BIN(expandarray), gen_expandarray);
5035 yjit_reg_op(BIN(newhash), gen_newhash);
5036 yjit_reg_op(BIN(newrange), gen_newrange);
5037 yjit_reg_op(BIN(concatstrings), gen_concatstrings);
5038 yjit_reg_op(BIN(putnil), gen_putnil);
5039 yjit_reg_op(BIN(putobject), gen_putobject);
5040 yjit_reg_op(BIN(putstring), gen_putstring);
5041 yjit_reg_op(BIN(putobject_INT2FIX_0_), gen_putobject_int2fix);
5042 yjit_reg_op(BIN(putobject_INT2FIX_1_), gen_putobject_int2fix);
5043 yjit_reg_op(BIN(putself), gen_putself);
5044 yjit_reg_op(BIN(putspecialobject), gen_putspecialobject);
5045 yjit_reg_op(BIN(getlocal), gen_getlocal);
5046 yjit_reg_op(BIN(getlocal_WC_0), gen_getlocal_wc0);
5047 yjit_reg_op(BIN(getlocal_WC_1), gen_getlocal_wc1);
5048 yjit_reg_op(BIN(setlocal), gen_setlocal);
5049 yjit_reg_op(BIN(setlocal_WC_0), gen_setlocal_wc0);
5050 yjit_reg_op(BIN(setlocal_WC_1), gen_setlocal_wc1);
5051 yjit_reg_op(BIN(getinstancevariable), gen_getinstancevariable);
5052 yjit_reg_op(BIN(setinstancevariable), gen_setinstancevariable);
5053 yjit_reg_op(BIN(defined), gen_defined);
5054 yjit_reg_op(BIN(checktype), gen_checktype);
5055 yjit_reg_op(BIN(checkkeyword), gen_checkkeyword);
5056 yjit_reg_op(BIN(opt_lt), gen_opt_lt);
5057 yjit_reg_op(BIN(opt_le), gen_opt_le);
5058 yjit_reg_op(BIN(opt_ge), gen_opt_ge);
5059 yjit_reg_op(BIN(opt_gt), gen_opt_gt);
5060 yjit_reg_op(BIN(opt_eq), gen_opt_eq);
5061 yjit_reg_op(BIN(opt_neq), gen_opt_neq);
5062 yjit_reg_op(BIN(opt_aref), gen_opt_aref);
5063 yjit_reg_op(BIN(opt_aset), gen_opt_aset);
5064 yjit_reg_op(BIN(opt_and), gen_opt_and);
5065 yjit_reg_op(BIN(opt_or), gen_opt_or);
5066 yjit_reg_op(BIN(opt_minus), gen_opt_minus);
5067 yjit_reg_op(BIN(opt_plus), gen_opt_plus);
5068 yjit_reg_op(BIN(opt_mult), gen_opt_mult);
5069 yjit_reg_op(BIN(opt_div), gen_opt_div);
5070 yjit_reg_op(BIN(opt_mod), gen_opt_mod);
5071 yjit_reg_op(BIN(opt_ltlt), gen_opt_ltlt);
5072 yjit_reg_op(BIN(opt_nil_p), gen_opt_nil_p);
5073 yjit_reg_op(BIN(opt_empty_p), gen_opt_empty_p);
5074 yjit_reg_op(BIN(opt_str_freeze), gen_opt_str_freeze);
5075 yjit_reg_op(BIN(opt_str_uminus), gen_opt_str_uminus);
5076 yjit_reg_op(BIN(opt_not), gen_opt_not);
5077 yjit_reg_op(BIN(opt_size), gen_opt_size);
5078 yjit_reg_op(BIN(opt_length), gen_opt_length);
5079 yjit_reg_op(BIN(opt_regexpmatch2), gen_opt_regexpmatch2);
5080 yjit_reg_op(BIN(opt_getinlinecache), gen_opt_getinlinecache);
5081 yjit_reg_op(BIN(invokebuiltin), gen_invokebuiltin);
5082 yjit_reg_op(BIN(opt_invokebuiltin_delegate), gen_opt_invokebuiltin_delegate);
5083 yjit_reg_op(BIN(opt_invokebuiltin_delegate_leave), gen_opt_invokebuiltin_delegate);
5084 yjit_reg_op(BIN(opt_case_dispatch), gen_opt_case_dispatch);
5085 yjit_reg_op(BIN(branchif), gen_branchif);
5086 yjit_reg_op(BIN(branchunless), gen_branchunless);
5087 yjit_reg_op(BIN(branchnil), gen_branchnil);
5088 yjit_reg_op(BIN(jump), gen_jump);
5089 yjit_reg_op(BIN(getblockparamproxy), gen_getblockparamproxy);
5090 yjit_reg_op(BIN(opt_send_without_block), gen_opt_send_without_block);
5091 yjit_reg_op(BIN(send), gen_send);
5092 yjit_reg_op(BIN(invokesuper), gen_invokesuper);
5093 yjit_reg_op(BIN(leave), gen_leave);
5094 yjit_reg_op(BIN(getglobal), gen_getglobal);
5095 yjit_reg_op(BIN(setglobal), gen_setglobal);
5096 yjit_reg_op(BIN(anytostring), gen_anytostring);
5097 yjit_reg_op(BIN(objtostring), gen_objtostring);
5098 yjit_reg_op(BIN(toregexp), gen_toregexp);
5099 yjit_reg_op(BIN(intern), gen_intern);
5100 yjit_reg_op(BIN(getspecial), gen_getspecial);
5101 yjit_reg_op(BIN(getclassvariable), gen_getclassvariable);
5102 yjit_reg_op(BIN(setclassvariable), gen_setclassvariable);
5104 yjit_method_codegen_table = st_init_numtable();
5106 // Specialization for C methods. See yjit_reg_method() for details.
5107 yjit_reg_method(rb_cBasicObject, "!", jit_rb_obj_not);
5109 yjit_reg_method(rb_cNilClass, "nil?", jit_rb_true);
5110 yjit_reg_method(rb_mKernel, "nil?", jit_rb_false);
5112 yjit_reg_method(rb_cBasicObject, "==", jit_rb_obj_equal);
5113 yjit_reg_method(rb_cBasicObject, "equal?", jit_rb_obj_equal);
5114 yjit_reg_method(rb_mKernel, "eql?", jit_rb_obj_equal);
5115 yjit_reg_method(rb_cModule, "==", jit_rb_obj_equal);
5116 yjit_reg_method(rb_cSymbol, "==", jit_rb_obj_equal);
5117 yjit_reg_method(rb_cSymbol, "===", jit_rb_obj_equal);
5119 // rb_str_to_s() methods in string.c
5120 yjit_reg_method(rb_cString, "to_s", jit_rb_str_to_s);
5121 yjit_reg_method(rb_cString, "to_str", jit_rb_str_to_s);
5122 yjit_reg_method(rb_cString, "bytesize", jit_rb_str_bytesize);
5124 // Thread.current
5125 yjit_reg_method(rb_singleton_class(rb_cThread), "current", jit_thread_s_current);