arch/x86/kernel/ftrace.c

   1 /*
   2  * Code for replacing ftrace calls with jumps.
   3  *
   4  * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
   5  *
   6  * Thanks goes to Ingo Molnar, for suggesting the idea.
   7  * Mathieu Desnoyers, for suggesting postponing the modifications.
   8  * Arjan van de Ven, for keeping me straight, and explaining to me
   9  * the dangers of modifying code on the run.
  10  */
  11
  12 #include <linux/spinlock.h>
  13 #include <linux/hardirq.h>
  14 #include <linux/uaccess.h>
  15 #include <linux/ftrace.h>
  16 #include <linux/percpu.h>
  17 #include <linux/init.h>
  18 #include <linux/list.h>
  19
  20 #include <asm/ftrace.h>
  21 #include <linux/ftrace.h>
  22 #include <asm/nops.h>
  23 #include <asm/nmi.h>
  24
  25
  26
  27 #ifdef CONFIG_FUNCTION_RET_TRACER
  28
/*
 * NMI tracking for the function return tracer.
 *
 * This is a simplified variant of the NMI handling used by the
 * dynamic ftrace code in this file: instead of cooperating with
 * NMIs, the return tracer ignores every trace taken in NMI context.
 *
 * in_nmi is incremented by ftrace_nmi_enter() and decremented by
 * ftrace_nmi_exit(); prepare_ftrace_return() does nothing while it
 * is non-zero.
 */
static atomic_t in_nmi;
  35
  36 void ftrace_nmi_enter(void)
  37 {
  38         atomic_inc(&in_nmi);
  39 }
  40
  41 void ftrace_nmi_exit(void)
  42 {
  43         atomic_dec(&in_nmi);
  44 }
  45
/*
 * Synchronize accesses to the return address stack with
 * interrupts. Taken by push_return_trace() and pop_return_trace()
 * with local interrupts disabled; set to the unlocked state by
 * init_ftrace_function_return().
 */
static raw_spinlock_t ret_stack_lock;
  51
  52 /* Add a function return address to the trace stack on thread info.*/
  53 static int push_return_trace(unsigned long ret, unsigned long long time,
  54                                 unsigned long func)
  55 {
  56         int index;
  57         struct thread_info *ti;
  58         unsigned long flags;
  59         int err = 0;
  60
  61         raw_local_irq_save(flags);
  62         __raw_spin_lock(&ret_stack_lock);
  63
  64         ti = current_thread_info();
  65         /* The return trace stack is full */
  66         if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) {
  67                 err = -EBUSY;
  68                 goto out;
  69         }
  70
  71         index = ++ti->curr_ret_stack;
  72         ti->ret_stack[index].ret = ret;
  73         ti->ret_stack[index].func = func;
  74         ti->ret_stack[index].calltime = time;
  75
  76 out:
  77         __raw_spin_unlock(&ret_stack_lock);
  78         raw_local_irq_restore(flags);
  79         return err;
  80 }
  81
  82 /* Retrieve a function return address to the trace stack on thread info.*/
  83 static void pop_return_trace(unsigned long *ret, unsigned long long *time,
  84                                 unsigned long *func)
  85 {
  86         struct thread_info *ti;
  87         int index;
  88         unsigned long flags;
  89
  90         raw_local_irq_save(flags);
  91         __raw_spin_lock(&ret_stack_lock);
  92
  93         ti = current_thread_info();
  94         index = ti->curr_ret_stack;
  95         *ret = ti->ret_stack[index].ret;
  96         *func = ti->ret_stack[index].func;
  97         *time = ti->ret_stack[index].calltime;
  98         ti->curr_ret_stack--;
  99
 100         __raw_spin_unlock(&ret_stack_lock);
 101         raw_local_irq_restore(flags);
 102 }
 103
 104 /*
 105  * Send the trace to the ring-buffer.
 106  * @return the original return address.
 107  */
 108 unsigned long ftrace_return_to_handler(void)
 109 {
 110         struct ftrace_retfunc trace;
 111         pop_return_trace(&trace.ret, &trace.calltime, &trace.func);
 112         trace.rettime = cpu_clock(raw_smp_processor_id());
 113         ftrace_function_return(&trace);
 114
 115         return trace.ret;
 116 }
 117
 118 /*
 119  * Hook the return address and push it in the stack of return addrs
 120  * in current thread info.
 121  */
 122 asmlinkage
 123 void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 124 {
 125         unsigned long old;
 126         unsigned long long calltime;
 127         int faulted;
 128         unsigned long return_hooker = (unsigned long)
 129                                 &return_to_handler;
 130
 131         /* Nmi's are currently unsupported */
 132         if (atomic_read(&in_nmi))
 133                 return;
 134
 135         /*
 136          * Protect against fault, even if it shouldn't
 137          * happen. This tool is too much intrusive to
 138          * ignore such a protection.
 139          */
 140         asm volatile(
 141                 "1: movl (%[parent_old]), %[old]\n"
 142                 "2: movl %[return_hooker], (%[parent_replaced])\n"
 143                 "   movl $0, %[faulted]\n"
 144
 145                 ".section .fixup, \"ax\"\n"
 146                 "3: movl $1, %[faulted]\n"
 147                 ".previous\n"
 148
 149                 ".section __ex_table, \"a\"\n"
 150                 "   .long 1b, 3b\n"
 151                 "   .long 2b, 3b\n"
 152                 ".previous\n"
 153
 154                 : [parent_replaced] "=r" (parent), [old] "=r" (old),
 155                   [faulted] "=r" (faulted)
 156                 : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
 157                 : "memory"
 158         );
 159
 160         if (WARN_ON(faulted)) {
 161                 unregister_ftrace_return();
 162                 return;
 163         }
 164
 165         if (WARN_ON(!__kernel_text_address(old))) {
 166                 unregister_ftrace_return();
 167                 *parent = old;
 168                 return;
 169         }
 170
 171         calltime = cpu_clock(raw_smp_processor_id());
 172
 173         if (push_return_trace(old, calltime, self_addr) == -EBUSY)
 174                 *parent = old;
 175 }
 176
 177 static int __init init_ftrace_function_return(void)
 178 {
 179         ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 180         return 0;
 181 }
 182 device_initcall(init_ftrace_function_return);
 183
 184
 185 #endif
 186
 187 #ifdef CONFIG_DYNAMIC_FTRACE
 188
 189 union ftrace_code_union {
 190         char code[MCOUNT_INSN_SIZE];
 191         struct {
 192                 char e8;
 193                 int offset;
 194         } __attribute__((packed));
 195 };
 196
 197 static int ftrace_calc_offset(long ip, long addr)
 198 {
 199         return (int)(addr - ip);
 200 }
 201
 202 unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 203 {
 204         static union ftrace_code_union calc;
 205
 206         calc.e8         = 0xe8;
 207         calc.offset     = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
 208
 209         /*
 210          * No locking needed, this must be called via kstop_machine
 211          * which in essence is like running on a uniprocessor machine.
 212          */
 213         return calc.code;
 214 }
 215
 216 /*
 217  * Modifying code must take extra care. On an SMP machine, if
 218  * the code being modified is also being executed on another CPU
 219  * that CPU will have undefined results and possibly take a GPF.
  220  * We use kstop_machine to stop other CPUS from executing code.
 221  * But this does not stop NMIs from happening. We still need
 222  * to protect against that. We separate out the modification of
 223  * the code to take care of this.
 224  *
 225  * Two buffers are added: An IP buffer and a "code" buffer.
 226  *
 227  * 1) Put the instruction pointer into the IP buffer
 228  *    and the new code into the "code" buffer.
 229  * 2) Set a flag that says we are modifying code
 230  * 3) Wait for any running NMIs to finish.
 231  * 4) Write the code
 232  * 5) clear the flag.
 233  * 6) Wait for any running NMIs to finish.
 234  *
 235  * If an NMI is executed, the first thing it does is to call
 236  * "ftrace_nmi_enter". This will check if the flag is set to write
 237  * and if it is, it will write what is in the IP and "code" buffers.
 238  *
 239  * The trick is, it does not matter if everyone is writing the same
 240  * content to the code location. Also, if a CPU is executing code
 241  * it is OK to write to that code location if the contents being written
 242  * are the same as what exists.
 243  */
 244
/*
 * State shared between do_ftrace_mod_code() and the NMI hooks.
 * in_nmi is incremented in ftrace_nmi_enter() and decremented in
 * ftrace_nmi_exit(); wait_for_nmi() spins until it reads zero.
 */
static atomic_t in_nmi = ATOMIC_INIT(0);
static int mod_code_status;             /* holds return value of text write */
static int mod_code_write;              /* set when NMI should do the write */
static void *mod_code_ip;               /* holds the IP to write to */
static void *mod_code_newcode;          /* holds the text to write to the IP */

/*
 * Statistics reported by ftrace_arch_read_dyn_info():
 * nmi_wait_count counts the wait_for_nmi() calls that had to spin,
 * nmi_update_count counts the text writes performed from
 * ftrace_nmi_enter().
 */
static unsigned nmi_wait_count;
static atomic_t nmi_update_count = ATOMIC_INIT(0);
 253
 254 int ftrace_arch_read_dyn_info(char *buf, int size)
 255 {
 256         int r;
 257
 258         r = snprintf(buf, size, "%u %u",
 259                      nmi_wait_count,
 260                      atomic_read(&nmi_update_count));
 261         return r;
 262 }
 263
 264 static void ftrace_mod_code(void)
 265 {
 266         /*
 267          * Yes, more than one CPU process can be writing to mod_code_status.
 268          *    (and the code itself)
 269          * But if one were to fail, then they all should, and if one were
 270          * to succeed, then they all should.
 271          */
 272         mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
 273                                              MCOUNT_INSN_SIZE);
 274
 275 }
 276
 277 void ftrace_nmi_enter(void)
 278 {
 279         atomic_inc(&in_nmi);
 280         /* Must have in_nmi seen before reading write flag */
 281         smp_mb();
 282         if (mod_code_write) {
 283                 ftrace_mod_code();
 284                 atomic_inc(&nmi_update_count);
 285         }
 286 }
 287
/*
 * NMI exit hook: issues a write barrier and then decrements in_nmi,
 * undoing the increment done by ftrace_nmi_enter(). wait_for_nmi()
 * spins until in_nmi reads zero.
 */
void ftrace_nmi_exit(void)
{
	/* Finish all executions before clearing in_nmi */
	smp_wmb();
	atomic_dec(&in_nmi);
}
 294
 295 static void wait_for_nmi(void)
 296 {
 297         int waited = 0;
 298
 299         while (atomic_read(&in_nmi)) {
 300                 waited = 1;
 301                 cpu_relax();
 302         }
 303
 304         if (waited)
 305                 nmi_wait_count++;
 306 }
 307
/*
 * Write @new_code (MCOUNT_INSN_SIZE bytes) to the kernel text at @ip
 * while cooperating with NMIs.
 *
 * The target and the new bytes are published in mod_code_ip and
 * mod_code_newcode, then mod_code_write is raised so that any NMI
 * entering through ftrace_nmi_enter() performs the same write. The
 * write is done here only after in_nmi has been seen at zero, and the
 * function returns only after the flag has been cleared and in_nmi
 * has been seen at zero again.
 *
 * Returns mod_code_status, i.e. the result stored by
 * ftrace_mod_code(); zero is treated as success by the caller.
 */
static int
do_ftrace_mod_code(unsigned long ip, void *new_code)
{
	mod_code_ip = (void *)ip;
	mod_code_newcode = new_code;

	/* The buffers need to be visible before we let NMIs write them */
	smp_wmb();

	mod_code_write = 1;

	/* Make sure write bit is visible before we wait on NMIs */
	smp_mb();

	wait_for_nmi();

	/* Make sure all running NMIs have finished before we write the code */
	smp_mb();

	ftrace_mod_code();

	/* Make sure the write happens before clearing the bit */
	smp_wmb();

	mod_code_write = 0;

	/* make sure NMIs see the cleared bit */
	smp_mb();

	wait_for_nmi();

	return mod_code_status;
}
 341
 342
 343
 344
 345 static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
 346
 347 unsigned char *ftrace_nop_replace(void)
 348 {
 349         return ftrace_nop;
 350 }
 351
 352 int
 353 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 354                    unsigned char *new_code)
 355 {
 356         unsigned char replaced[MCOUNT_INSN_SIZE];
 357
 358         /*
 359          * Note: Due to modules and __init, code can
 360          *  disappear and change, we need to protect against faulting
 361          *  as well as code changing. We do this by using the
 362          *  probe_kernel_* functions.
 363          *
 364          * No real locking needed, this code is run through
 365          * kstop_machine, or before SMP starts.
 366          */
 367
 368         /* read the text we want to modify */
 369         if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
 370                 return -EFAULT;
 371
 372         /* Make sure it is what we expect it to be */
 373         if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
 374                 return -EINVAL;
 375
 376         /* replace the text with the new text */
 377         if (do_ftrace_mod_code(ip, new_code))
 378                 return -EPERM;
 379
 380         sync_core();
 381
 382         return 0;
 383 }
 384
 385 int ftrace_update_ftrace_func(ftrace_func_t func)
 386 {
 387         unsigned long ip = (unsigned long)(&ftrace_call);
 388         unsigned char old[MCOUNT_INSN_SIZE], *new;
 389         int ret;
 390
 391         memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
 392         new = ftrace_call_replace(ip, (unsigned long)func);
 393         ret = ftrace_modify_code(ip, old, new);
 394
 395         return ret;
 396 }
 397
 398 int __init ftrace_dyn_arch_init(void *data)
 399 {
 400         extern const unsigned char ftrace_test_p6nop[];
 401         extern const unsigned char ftrace_test_nop5[];
 402         extern const unsigned char ftrace_test_jmp[];
 403         int faulted = 0;
 404
 405         /*
 406          * There is no good nop for all x86 archs.
 407          * We will default to using the P6_NOP5, but first we
 408          * will test to make sure that the nop will actually
 409          * work on this CPU. If it faults, we will then
 410          * go to a lesser efficient 5 byte nop. If that fails
 411          * we then just use a jmp as our nop. This isn't the most
 412          * efficient nop, but we can not use a multi part nop
 413          * since we would then risk being preempted in the middle
 414          * of that nop, and if we enabled tracing then, it might
 415          * cause a system crash.
 416          *
 417          * TODO: check the cpuid to determine the best nop.
 418          */
 419         asm volatile (
 420                 "ftrace_test_jmp:"
 421                 "jmp ftrace_test_p6nop\n"
 422                 "nop\n"
 423                 "nop\n"
 424                 "nop\n"  /* 2 byte jmp + 3 bytes */
 425                 "ftrace_test_p6nop:"
 426                 P6_NOP5
 427                 "jmp 1f\n"
 428                 "ftrace_test_nop5:"
 429                 ".byte 0x66,0x66,0x66,0x66,0x90\n"
 430                 "1:"
 431                 ".section .fixup, \"ax\"\n"
 432                 "2:     movl $1, %0\n"
 433                 "       jmp ftrace_test_nop5\n"
 434                 "3:     movl $2, %0\n"
 435                 "       jmp 1b\n"
 436                 ".previous\n"
 437                 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
 438                 _ASM_EXTABLE(ftrace_test_nop5, 3b)
 439                 : "=r"(faulted) : "0" (faulted));
 440
 441         switch (faulted) {
 442         case 0:
 443                 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
 444                 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
 445                 break;
 446         case 1:
 447                 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
 448                 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
 449                 break;
 450         case 2:
 451                 pr_info("ftrace: converting mcount calls to jmp . + 5\n");
 452                 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
 453                 break;
 454         }
 455
 456         /* The return code is retured via data */
 457         *(unsigned long *)data = 0;
 458
 459         return 0;
 460 }
 461 #endif