arch/powerpc/oprofile/op_model_cell.c

   1 /*
   2  * Cell Broadband Engine OProfile Support
   3  *
   4  * (C) Copyright IBM Corporation 2006
   5  *
   6  * Author: David Erb (djerb@us.ibm.com)
   7  * Modifications:
   8  *         Carl Love <carll@us.ibm.com>
   9  *         Maynard Johnson <maynardj@us.ibm.com>
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License
  13  * as published by the Free Software Foundation; either version
  14  * 2 of the License, or (at your option) any later version.
  15  */
  16
  17 #include <linux/cpufreq.h>
  18 #include <linux/delay.h>
  19 #include <linux/init.h>
  20 #include <linux/jiffies.h>
  21 #include <linux/kthread.h>
  22 #include <linux/oprofile.h>
  23 #include <linux/percpu.h>
  24 #include <linux/smp.h>
  25 #include <linux/spinlock.h>
  26 #include <linux/timer.h>
  27 #include <asm/cell-pmu.h>
  28 #include <asm/cputable.h>
  29 #include <asm/firmware.h>
  30 #include <asm/io.h>
  31 #include <asm/oprofile_impl.h>
  32 #include <asm/processor.h>
  33 #include <asm/prom.h>
  34 #include <asm/ptrace.h>
  35 #include <asm/reg.h>
  36 #include <asm/rtas.h>
  37 #include <asm/cell-regs.h>
  38
  39 #include "../platforms/cell/interrupt.h"
  40 #include "cell/pr_util.h"
  41
  42 #define PPU_PROFILING            0
  43 #define SPU_PROFILING_CYCLES     1
  44 #define SPU_PROFILING_EVENTS     2
  45
  46 #define SPU_EVENT_NUM_START      4100
  47 #define SPU_EVENT_NUM_STOP       4399
  48 #define SPU_PROFILE_EVENT_ADDR          4363  /* spu, address trace, decimal */
  49 #define SPU_PROFILE_EVENT_ADDR_MASK_A   0x146 /* sub unit set to zero */
  50 #define SPU_PROFILE_EVENT_ADDR_MASK_B   0x186 /* sub unit set to zero */
  51
  52 #define NUM_SPUS_PER_NODE    8
  53 #define SPU_CYCLES_EVENT_NUM 2  /*  event number for SPU_CYCLES */
  54
  55 #define PPU_CYCLES_EVENT_NUM 1  /*  event number for CYCLES */
  56 #define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
  57                                  * PPU_CYCLES event
  58                                  */
  59 #define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */
  60
  61 #define NUM_THREADS 2         /* number of physical threads in
  62                                * physical processor
  63                                */
  64 #define NUM_DEBUG_BUS_WORDS 4
  65 #define NUM_INPUT_BUS_WORDS 2
  66
  67 #define MAX_SPU_COUNT 0xFFFFFF  /* maximum 24 bit LFSR value */
  68
  69 /* Minimum HW interval timer setting to send value to trace buffer is 10 cycle.
  70  * To configure counter to send value every N cycles set counter to
  71  * 2^32 - 1 - N.
  72  */
  73 #define NUM_INTERVAL_CYC  0xFFFFFFFF - 10
  74
  75 /*
  76  * spu_cycle_reset is the number of cycles between samples.
  77  * This variable is used for SPU profiling and should ONLY be set
  78  * at the beginning of cell_reg_setup; otherwise, it's read-only.
  79  */
  80 static unsigned int spu_cycle_reset;
  81 static unsigned int profiling_mode;
  82 static int spu_evnt_phys_spu_indx;
  83
  84 struct pmc_cntrl_data {
  85         unsigned long vcntr;
  86         unsigned long evnts;
  87         unsigned long masks;
  88         unsigned long enabled;
  89 };
  90
  91 /*
  92  * ibm,cbe-perftools rtas parameters
  93  */
  94 struct pm_signal {
  95         u16 cpu;                /* Processor to modify */
  96         u16 sub_unit;           /* hw subunit this applies to (if applicable)*/
  97         short int signal_group; /* Signal Group to Enable/Disable */
  98         u8 bus_word;            /* Enable/Disable on this Trace/Trigger/Event
  99                                  * Bus Word(s) (bitmask)
 100                                  */
 101         u8 bit;                 /* Trigger/Event bit (if applicable) */
 102 };
 103
 104 /*
 105  * rtas call arguments
 106  */
 107 enum {
 108         SUBFUNC_RESET = 1,
 109         SUBFUNC_ACTIVATE = 2,
 110         SUBFUNC_DEACTIVATE = 3,
 111
 112         PASSTHRU_IGNORE = 0,
 113         PASSTHRU_ENABLE = 1,
 114         PASSTHRU_DISABLE = 2,
 115 };
 116
 117 struct pm_cntrl {
 118         u16 enable;
 119         u16 stop_at_max;
 120         u16 trace_mode;
 121         u16 freeze;
 122         u16 count_mode;
 123         u16 spu_addr_trace;
 124         u8  trace_buf_ovflw;
 125 };
 126
 127 static struct {
 128         u32 group_control;
 129         u32 debug_bus_control;
 130         struct pm_cntrl pm_cntrl;
 131         u32 pm07_cntrl[NR_PHYS_CTRS];
 132 } pm_regs;
 133
 134 #define GET_SUB_UNIT(x) ((x & 0x0000f000) >> 12)
 135 #define GET_BUS_WORD(x) ((x & 0x000000f0) >> 4)
 136 #define GET_BUS_TYPE(x) ((x & 0x00000300) >> 8)
 137 #define GET_POLARITY(x) ((x & 0x00000002) >> 1)
 138 #define GET_COUNT_CYCLES(x) (x & 0x00000001)
 139 #define GET_INPUT_CONTROL(x) ((x & 0x00000004) >> 2)
 140
 141 static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
 142 static unsigned long spu_pm_cnt[MAX_NUMNODES * NUM_SPUS_PER_NODE];
 143 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
 144
/*
 * The CELL profiling code makes rtas calls to setup the debug bus to
 * route the performance signals.  Additionally, SPU profiling requires
 * a second rtas call to setup the hardware to capture the SPU PCs.
 * The EIO error value is returned if the token lookups or the rtas
 * call fail.  The EIO error number is the best choice of the existing
 * error numbers.  The probability of rtas related error is very low.  But
 * by returning EIO and printing additional information to dmesg the user
 * will know that OProfile did not start and dmesg will tell them why.
 * OProfile does not support returning errors on Stop.  Not a huge issue
 * since failure to reset the debug bus or stop the SPU PC collection is
 * not a fatal issue.  Chances are if the Stop failed, Start doesn't work
 * either.
 */
 159
 160 /*
 161  * Interpetation of hdw_thread:
 162  * 0 - even virtual cpus 0, 2, 4,...
 163  * 1 - odd virtual cpus 1, 3, 5, ...
 164  *
 165  * FIXME: this is strictly wrong, we need to clean this up in a number
 166  * of places. It works for now. -arnd
 167  */
 168 static u32 hdw_thread;
 169
 170 static u32 virt_cntr_inter_mask;
 171 static struct timer_list timer_virt_cntr;
 172 static struct timer_list timer_spu_event_swap;
 173
 174 /*
 175  * pm_signal needs to be global since it is initialized in
 176  * cell_reg_setup at the time when the necessary information
 177  * is available.
 178  */
 179 static struct pm_signal pm_signal[NR_PHYS_CTRS];
 180 static int pm_rtas_token;    /* token for debug bus setup call */
 181 static int spu_rtas_token;   /* token for SPU cycle profiling */
 182
 183 static u32 reset_value[NR_PHYS_CTRS];
 184 static int num_counters;
 185 static int oprofile_running;
 186 static DEFINE_SPINLOCK(cntr_lock);
 187
 188 static u32 ctr_enabled;
 189
 190 static unsigned char input_bus[NUM_INPUT_BUS_WORDS];
 191
 192 /*
 193  * Firmware interface functions
 194  */
 195 static int
 196 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 197                        void *address, unsigned long length)
 198 {
 199         u64 paddr = __pa(address);
 200
 201         return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc,
 202                          passthru, paddr >> 32, paddr & 0xffffffff, length);
 203 }
 204
 205 static void pm_rtas_reset_signals(u32 node)
 206 {
 207         int ret;
 208         struct pm_signal pm_signal_local;
 209
 210         /*
 211          * The debug bus is being set to the passthru disable state.
 212          * However, the FW still expects atleast one legal signal routing
 213          * entry or it will return an error on the arguments.   If we don't
 214          * supply a valid entry, we must ignore all return values.  Ignoring
 215          * all return values means we might miss an error we should be
 216          * concerned about.
 217          */
 218
 219         /*  fw expects physical cpu #. */
 220         pm_signal_local.cpu = node;
 221         pm_signal_local.signal_group = 21;
 222         pm_signal_local.bus_word = 1;
 223         pm_signal_local.sub_unit = 0;
 224         pm_signal_local.bit = 0;
 225
 226         ret = rtas_ibm_cbe_perftools(SUBFUNC_RESET, PASSTHRU_DISABLE,
 227                                      &pm_signal_local,
 228                                      sizeof(struct pm_signal));
 229
 230         if (unlikely(ret))
 231                 /*
 232                  * Not a fatal error. For Oprofile stop, the oprofile
 233                  * functions do not support returning an error for
 234                  * failure to stop OProfile.
 235                  */
 236                 printk(KERN_WARNING "%s: rtas returned: %d\n",
 237                        __func__, ret);
 238 }
 239
 240 static int pm_rtas_activate_signals(u32 node, u32 count)
 241 {
 242         int ret;
 243         int i, j;
 244         struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 245
 246         /*
 247          * There is no debug setup required for the cycles event.
 248          * Note that only events in the same group can be used.
 249          * Otherwise, there will be conflicts in correctly routing
 250          * the signals on the debug bus.  It is the responsibility
 251          * of the OProfile user tool to check the events are in
 252          * the same group.
 253          */
 254         i = 0;
 255         for (j = 0; j < count; j++) {
 256                 if (pm_signal[j].signal_group != PPU_CYCLES_GRP_NUM) {
 257
 258                         /* fw expects physical cpu # */
 259                         pm_signal_local[i].cpu = node;
 260                         pm_signal_local[i].signal_group
 261                                 = pm_signal[j].signal_group;
 262                         pm_signal_local[i].bus_word = pm_signal[j].bus_word;
 263                         pm_signal_local[i].sub_unit = pm_signal[j].sub_unit;
 264                         pm_signal_local[i].bit = pm_signal[j].bit;
 265                         i++;
 266                 }
 267         }
 268
 269         if (i != 0) {
 270                 ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, PASSTHRU_ENABLE,
 271                                              pm_signal_local,
 272                                              i * sizeof(struct pm_signal));
 273
 274                 if (unlikely(ret)) {
 275                         printk(KERN_WARNING "%s: rtas returned: %d\n",
 276                                __func__, ret);
 277                         return -EIO;
 278                 }
 279         }
 280
 281         return 0;
 282 }
 283
 284 /*
 285  * PM Signal functions
 286  */
 287 static void set_pm_event(u32 ctr, int event, u32 unit_mask)
 288 {
 289         struct pm_signal *p;
 290         u32 signal_bit;
 291         u32 bus_word, bus_type, count_cycles, polarity, input_control;
 292         int j, i;
 293
 294         if (event == PPU_CYCLES_EVENT_NUM) {
 295                 /* Special Event: Count all cpu cycles */
 296                 pm_regs.pm07_cntrl[ctr] = CBE_COUNT_ALL_CYCLES;
 297                 p = &(pm_signal[ctr]);
 298                 p->signal_group = PPU_CYCLES_GRP_NUM;
 299                 p->bus_word = 1;
 300                 p->sub_unit = 0;
 301                 p->bit = 0;
 302                 goto out;
 303         } else {
 304                 pm_regs.pm07_cntrl[ctr] = 0;
 305         }
 306
 307         bus_word = GET_BUS_WORD(unit_mask);
 308         bus_type = GET_BUS_TYPE(unit_mask);
 309         count_cycles = GET_COUNT_CYCLES(unit_mask);
 310         polarity = GET_POLARITY(unit_mask);
 311         input_control = GET_INPUT_CONTROL(unit_mask);
 312         signal_bit = (event % 100);
 313
 314         p = &(pm_signal[ctr]);
 315
 316         p->signal_group = event / 100;
 317         p->bus_word = bus_word;
 318         p->sub_unit = GET_SUB_UNIT(unit_mask);
 319
 320         pm_regs.pm07_cntrl[ctr] = 0;
 321         pm_regs.pm07_cntrl[ctr] |= PM07_CTR_COUNT_CYCLES(count_cycles);
 322         pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity);
 323         pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control);
 324
 325         /*
 326          * Some of the islands signal selection is based on 64 bit words.
 327          * The debug bus words are 32 bits, the input words to the performance
 328          * counters are defined as 32 bits.  Need to convert the 64 bit island
 329          * specification to the appropriate 32 input bit and bus word for the
 330          * performance counter event selection.  See the CELL Performance
 331          * monitoring signals manual and the Perf cntr hardware descriptions
 332          * for the details.
 333          */
 334         if (input_control == 0) {
 335                 if (signal_bit > 31) {
 336                         signal_bit -= 32;
 337                         if (bus_word == 0x3)
 338                                 bus_word = 0x2;
 339                         else if (bus_word == 0xc)
 340                                 bus_word = 0x8;
 341                 }
 342
 343                 if ((bus_type == 0) && p->signal_group >= 60)
 344                         bus_type = 2;
 345                 if ((bus_type == 1) && p->signal_group >= 50)
 346                         bus_type = 0;
 347
 348                 pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_MUX(signal_bit);
 349         } else {
 350                 pm_regs.pm07_cntrl[ctr] = 0;
 351                 p->bit = signal_bit;
 352         }
 353
 354         for (i = 0; i < NUM_DEBUG_BUS_WORDS; i++) {
 355                 if (bus_word & (1 << i)) {
 356                         pm_regs.debug_bus_control |=
 357                                 (bus_type << (30 - (2 * i)));
 358
 359                         for (j = 0; j < NUM_INPUT_BUS_WORDS; j++) {
 360                                 if (input_bus[j] == 0xff) {
 361                                         input_bus[j] = i;
 362                                         pm_regs.group_control |=
 363                                                 (i << (30 - (2 * j)));
 364
 365                                         break;
 366                                 }
 367                         }
 368                 }
 369         }
 370 out:
 371         ;
 372 }
 373
 374 static void write_pm_cntrl(int cpu)
 375 {
 376         /*
 377          * Oprofile will use 32 bit counters, set bits 7:10 to 0
 378          * pmregs.pm_cntrl is a global
 379          */
 380
 381         u32 val = 0;
 382         if (pm_regs.pm_cntrl.enable == 1)
 383                 val |= CBE_PM_ENABLE_PERF_MON;
 384
 385         if (pm_regs.pm_cntrl.stop_at_max == 1)
 386                 val |= CBE_PM_STOP_AT_MAX;
 387
 388         if (pm_regs.pm_cntrl.trace_mode != 0)
 389                 val |= CBE_PM_TRACE_MODE_SET(pm_regs.pm_cntrl.trace_mode);
 390
 391         if (pm_regs.pm_cntrl.trace_buf_ovflw == 1)
 392                 val |= CBE_PM_TRACE_BUF_OVFLW(pm_regs.pm_cntrl.trace_buf_ovflw);
 393         if (pm_regs.pm_cntrl.freeze == 1)
 394                 val |= CBE_PM_FREEZE_ALL_CTRS;
 395
 396         val |= CBE_PM_SPU_ADDR_TRACE_SET(pm_regs.pm_cntrl.spu_addr_trace);
 397
 398         /*
 399          * Routine set_count_mode must be called previously to set
 400          * the count mode based on the user selection of user and kernel.
 401          */
 402         val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
 403         cbe_write_pm(cpu, pm_control, val);
 404 }
 405
 406 static inline void
 407 set_count_mode(u32 kernel, u32 user)
 408 {
 409         /*
 410          * The user must specify user and kernel if they want them. If
 411          *  neither is specified, OProfile will count in hypervisor mode.
 412          *  pm_regs.pm_cntrl is a global
 413          */
 414         if (kernel) {
 415                 if (user)
 416                         pm_regs.pm_cntrl.count_mode = CBE_COUNT_ALL_MODES;
 417                 else
 418                         pm_regs.pm_cntrl.count_mode =
 419                                 CBE_COUNT_SUPERVISOR_MODE;
 420         } else {
 421                 if (user)
 422                         pm_regs.pm_cntrl.count_mode = CBE_COUNT_PROBLEM_MODE;
 423                 else
 424                         pm_regs.pm_cntrl.count_mode =
 425                                 CBE_COUNT_HYPERVISOR_MODE;
 426         }
 427 }
 428
 429 static inline void enable_ctr(u32 cpu, u32 ctr, u32 *pm07_cntrl)
 430 {
 431
 432         pm07_cntrl[ctr] |= CBE_PM_CTR_ENABLE;
 433         cbe_write_pm07_control(cpu, ctr, pm07_cntrl[ctr]);
 434 }
 435
 436 /*
 437  * Oprofile is expected to collect data on all CPUs simultaneously.
 438  * However, there is one set of performance counters per node.  There are
 439  * two hardware threads or virtual CPUs on each node.  Hence, OProfile must
 440  * multiplex in time the performance counter collection on the two virtual
 441  * CPUs.  The multiplexing of the performance counters is done by this
 442  * virtual counter routine.
 443  *
 444  * The pmc_values used below is defined as 'per-cpu' but its use is
 445  * more akin to 'per-node'.  We need to store two sets of counter
 446  * values per node -- one for the previous run and one for the next.
 447  * The per-cpu[NR_PHYS_CTRS] gives us the storage we need.  Each odd/even
 448  * pair of per-cpu arrays is used for storing the previous and next
 449  * pmc values for a given node.
 450  * NOTE: We use the per-cpu variable to improve cache performance.
 451  *
 452  * This routine will alternate loading the virtual counters for
 453  * virtual CPUs
 454  */
 455 static void cell_virtual_cntr(unsigned long data)
 456 {
 457         int i, prev_hdw_thread, next_hdw_thread;
 458         u32 cpu;
 459         unsigned long flags;
 460
 461         /*
 462          * Make sure that the interrupt_hander and the virt counter are
 463          * not both playing with the counters on the same node.
 464          */
 465
 466         spin_lock_irqsave(&cntr_lock, flags);
 467
 468         prev_hdw_thread = hdw_thread;
 469
 470         /* switch the cpu handling the interrupts */
 471         hdw_thread = 1 ^ hdw_thread;
 472         next_hdw_thread = hdw_thread;
 473
 474         pm_regs.group_control = 0;
 475         pm_regs.debug_bus_control = 0;
 476
 477         for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
 478                 input_bus[i] = 0xff;
 479
 480         /*
 481          * There are some per thread events.  Must do the
 482          * set event, for the thread that is being started
 483          */
 484         for (i = 0; i < num_counters; i++)
 485                 set_pm_event(i,
 486                         pmc_cntrl[next_hdw_thread][i].evnts,
 487                         pmc_cntrl[next_hdw_thread][i].masks);
 488
 489         /*
 490          * The following is done only once per each node, but
 491          * we need cpu #, not node #, to pass to the cbe_xxx functions.
 492          */
 493         for_each_online_cpu(cpu) {
 494                 if (cbe_get_hw_thread_id(cpu))
 495                         continue;
 496
 497                 /*
 498                  * stop counters, save counter values, restore counts
 499                  * for previous thread
 500                  */
 501                 cbe_disable_pm(cpu);
 502                 cbe_disable_pm_interrupts(cpu);
 503                 for (i = 0; i < num_counters; i++) {
 504                         per_cpu(pmc_values, cpu + prev_hdw_thread)[i]
 505                                 = cbe_read_ctr(cpu, i);
 506
 507                         if (per_cpu(pmc_values, cpu + next_hdw_thread)[i]
 508                             == 0xFFFFFFFF)
 509                                 /* If the cntr value is 0xffffffff, we must
 510                                  * reset that to 0xfffffff0 when the current
 511                                  * thread is restarted.  This will generate a
 512                                  * new interrupt and make sure that we never
 513                                  * restore the counters to the max value.  If
 514                                  * the counters were restored to the max value,
 515                                  * they do not increment and no interrupts are
 516                                  * generated.  Hence no more samples will be
 517                                  * collected on that cpu.
 518                                  */
 519                                 cbe_write_ctr(cpu, i, 0xFFFFFFF0);
 520                         else
 521                                 cbe_write_ctr(cpu, i,
 522                                               per_cpu(pmc_values,
 523                                                       cpu +
 524                                                       next_hdw_thread)[i]);
 525                 }
 526
 527                 /*
 528                  * Switch to the other thread. Change the interrupt
 529                  * and control regs to be scheduled on the CPU
 530                  * corresponding to the thread to execute.
 531                  */
 532                 for (i = 0; i < num_counters; i++) {
 533                         if (pmc_cntrl[next_hdw_thread][i].enabled) {
 534                                 /*
 535                                  * There are some per thread events.
 536                                  * Must do the set event, enable_cntr
 537                                  * for each cpu.
 538                                  */
 539                                 enable_ctr(cpu, i,
 540                                            pm_regs.pm07_cntrl);
 541                         } else {
 542                                 cbe_write_pm07_control(cpu, i, 0);
 543                         }
 544                 }
 545
 546                 /* Enable interrupts on the CPU thread that is starting */
 547                 cbe_enable_pm_interrupts(cpu, next_hdw_thread,
 548                                          virt_cntr_inter_mask);
 549                 cbe_enable_pm(cpu);
 550         }
 551
 552         spin_unlock_irqrestore(&cntr_lock, flags);
 553
 554         mod_timer(&timer_virt_cntr, jiffies + HZ / 10);
 555 }
 556
 557 static void start_virt_cntrs(void)
 558 {
 559         init_timer(&timer_virt_cntr);
 560         timer_virt_cntr.function = cell_virtual_cntr;
 561         timer_virt_cntr.data = 0UL;
 562         timer_virt_cntr.expires = jiffies + HZ / 10;
 563         add_timer(&timer_virt_cntr);
 564 }
 565
 566 static int cell_reg_setup_spu_cycles(struct op_counter_config *ctr,
 567                         struct op_system_config *sys, int num_ctrs)
 568 {
 569         spu_cycle_reset = ctr[0].count;
 570
 571         /*
 572          * Each node will need to make the rtas call to start
 573          * and stop SPU profiling.  Get the token once and store it.
 574          */
 575         spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
 576
 577         if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) {
 578                 printk(KERN_ERR
 579                        "%s: rtas token ibm,cbe-spu-perftools unknown\n",
 580                        __func__);
 581                 return -EIO;
 582         }
 583         return 0;
 584 }
 585
 586 /* Unfortunately, the hardware will only support event profiling
 587  * on one SPU per node at a time.  Therefore, we must time slice
 588  * the profiling across all SPUs in the node.  Note, we do this
 589  * in parallel for each node.  The following routine is called
 590  * periodically based on kernel timer to switch which SPU is
 591  * being monitored in a round robbin fashion.
 592  */
 593 static void spu_evnt_swap(unsigned long data)
 594 {
 595         int node;
 596         int cur_phys_spu, nxt_phys_spu, cur_spu_evnt_phys_spu_indx;
 597         unsigned long flags;
 598         int cpu;
 599         int ret;
 600         u32 interrupt_mask;
 601
 602
 603         /* enable interrupts on cntr 0 */
 604         interrupt_mask = CBE_PM_CTR_OVERFLOW_INTR(0);
 605
 606         hdw_thread = 0;
 607
 608         /* Make sure spu event interrupt handler and spu event swap
 609          * don't access the counters simultaneously.
 610          */
 611         spin_lock_irqsave(&cntr_lock, flags);
 612
 613         cur_spu_evnt_phys_spu_indx = spu_evnt_phys_spu_indx;
 614
 615         if (++(spu_evnt_phys_spu_indx) == NUM_SPUS_PER_NODE)
 616                 spu_evnt_phys_spu_indx = 0;
 617
 618         pm_signal[0].sub_unit = spu_evnt_phys_spu_indx;
 619         pm_signal[1].sub_unit = spu_evnt_phys_spu_indx;
 620         pm_signal[2].sub_unit = spu_evnt_phys_spu_indx;
 621
 622         /* switch the SPU being profiled on each node */
 623         for_each_online_cpu(cpu) {
 624                 if (cbe_get_hw_thread_id(cpu))
 625                         continue;
 626
 627                 node = cbe_cpu_to_node(cpu);
 628                 cur_phys_spu = (node * NUM_SPUS_PER_NODE)
 629                         + cur_spu_evnt_phys_spu_indx;
 630                 nxt_phys_spu = (node * NUM_SPUS_PER_NODE)
 631                         + spu_evnt_phys_spu_indx;
 632
 633                 /*
 634                  * stop counters, save counter values, restore counts
 635                  * for previous physical SPU
 636                  */
 637                 cbe_disable_pm(cpu);
 638                 cbe_disable_pm_interrupts(cpu);
 639
 640                 spu_pm_cnt[cur_phys_spu]
 641                         = cbe_read_ctr(cpu, 0);
 642
 643                 /* restore previous count for the next spu to sample */
 644                 /* NOTE, hardware issue, counter will not start if the
 645                  * counter value is at max (0xFFFFFFFF).
 646                  */
 647                 if (spu_pm_cnt[nxt_phys_spu] >= 0xFFFFFFFF)
 648                         cbe_write_ctr(cpu, 0, 0xFFFFFFF0);
 649                  else
 650                          cbe_write_ctr(cpu, 0, spu_pm_cnt[nxt_phys_spu]);
 651
 652                 pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
 653
 654                 /* setup the debug bus measure the one event and
 655                  * the two events to route the next SPU's PC on
 656                  * the debug bus
 657                  */
 658                 ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu), 3);
 659                 if (ret)
 660                         printk(KERN_ERR "%s: pm_rtas_activate_signals failed, "
 661                                "SPU event swap\n", __func__);
 662
 663                 /* clear the trace buffer, don't want to take PC for
 664                  * previous SPU*/
 665                 cbe_write_pm(cpu, trace_address, 0);
 666
 667                 enable_ctr(cpu, 0, pm_regs.pm07_cntrl);
 668
 669                 /* Enable interrupts on the CPU thread that is starting */
 670                 cbe_enable_pm_interrupts(cpu, hdw_thread,
 671                                          interrupt_mask);
 672                 cbe_enable_pm(cpu);
 673         }
 674
 675         spin_unlock_irqrestore(&cntr_lock, flags);
 676
 677         /* swap approximately every 0.1 seconds */
 678         mod_timer(&timer_spu_event_swap, jiffies + HZ / 25);
 679 }
 680
 681 static void start_spu_event_swap(void)
 682 {
 683         init_timer(&timer_spu_event_swap);
 684         timer_spu_event_swap.function = spu_evnt_swap;
 685         timer_spu_event_swap.data = 0UL;
 686         timer_spu_event_swap.expires = jiffies + HZ / 25;
 687         add_timer(&timer_spu_event_swap);
 688 }
 689
 690 static int cell_reg_setup_spu_events(struct op_counter_config *ctr,
 691                         struct op_system_config *sys, int num_ctrs)
 692 {
 693         int i;
 694
 695         /* routine is called once for all nodes */
 696
 697         spu_evnt_phys_spu_indx = 0;
 698         /*
 699          * For all events except PPU CYCLEs, each node will need to make
 700          * the rtas cbe-perftools call to setup and reset the debug bus.
 701          * Make the token lookup call once and store it in the global
 702          * variable pm_rtas_token.
 703          */
 704         pm_rtas_token = rtas_token("ibm,cbe-perftools");
 705
 706         if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
 707                 printk(KERN_ERR
 708                        "%s: rtas token ibm,cbe-perftools unknown\n",
 709                        __func__);
 710                 return -EIO;
 711         }
 712
 713         /* setup the pm_control register settings,
 714          * settings will be written per node by the
 715          * cell_cpu_setup() function.
 716          */
 717         pm_regs.pm_cntrl.trace_buf_ovflw = 1;
 718
 719         /* Use the occurrence trace mode to have SPU PC saved
 720          * to the trace buffer.  Occurrence data in trace buffer
 721          * is not used.  Bit 2 must be set to store SPU addresses.
 722          */
 723         pm_regs.pm_cntrl.trace_mode = 2;
 724
 725         pm_regs.pm_cntrl.spu_addr_trace = 0x1;  /* using debug bus
 726                                                    event 2 & 3 */
 727
 728         /* setup the debug bus event array with the SPU PC routing events.
 729         *  Note, pm_signal[0] will be filled in by set_pm_event() call below.
 730         */
 731         pm_signal[1].signal_group = SPU_PROFILE_EVENT_ADDR / 100;
 732         pm_signal[1].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_A);
 733         pm_signal[1].bit = SPU_PROFILE_EVENT_ADDR % 100;
 734         pm_signal[1].sub_unit = spu_evnt_phys_spu_indx;
 735
 736         pm_signal[2].signal_group = SPU_PROFILE_EVENT_ADDR / 100;
 737         pm_signal[2].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_B);
 738         pm_signal[2].bit = SPU_PROFILE_EVENT_ADDR % 100;
 739         pm_signal[2].sub_unit = spu_evnt_phys_spu_indx;
 740
 741         /* Set the user selected spu event to profile on,
 742          * note, only one SPU profiling event is supported
 743          */
 744         num_counters = 1;  /* Only support one SPU event at a time */
 745         set_pm_event(0, ctr[0].event, ctr[0].unit_mask);
 746
 747         reset_value[0] = 0xFFFFFFFF - ctr[0].count;
 748
 749         /* global, used by cell_cpu_setup */
 750         ctr_enabled |= 1;
 751
 752         /* Initialize the count for each SPU to the reset value */
 753         for (i=0; i < MAX_NUMNODES * NUM_SPUS_PER_NODE; i++)
 754                 spu_pm_cnt[i] = reset_value[0];
 755
 756         return 0;
 757 }
 758
 759 static int cell_reg_setup_ppu(struct op_counter_config *ctr,
 760                         struct op_system_config *sys, int num_ctrs)
 761 {
 762         /* routine is called once for all nodes */
 763         int i, j, cpu;
 764
 765         num_counters = num_ctrs;
 766
 767         if (unlikely(num_ctrs > NR_PHYS_CTRS)) {
 768                 printk(KERN_ERR
 769                        "%s: Oprofile, number of specified events " \
 770                        "exceeds number of physical counters\n",
 771                        __func__);
 772                 return -EIO;
 773         }
 774
 775         set_count_mode(sys->enable_kernel, sys->enable_user);
 776
 777         /* Setup the thread 0 events */
 778         for (i = 0; i < num_ctrs; ++i) {
 779
 780                 pmc_cntrl[0][i].evnts = ctr[i].event;
 781                 pmc_cntrl[0][i].masks = ctr[i].unit_mask;
 782                 pmc_cntrl[0][i].enabled = ctr[i].enabled;
 783                 pmc_cntrl[0][i].vcntr = i;
 784
 785                 for_each_possible_cpu(j)
 786                         per_cpu(pmc_values, j)[i] = 0;
 787         }
 788
 789         /*
 790          * Setup the thread 1 events, map the thread 0 event to the
 791          * equivalent thread 1 event.
 792          */
 793         for (i = 0; i < num_ctrs; ++i) {
 794                 if ((ctr[i].event >= 2100) && (ctr[i].event <= 2111))
 795                         pmc_cntrl[1][i].evnts = ctr[i].event + 19;
 796                 else if (ctr[i].event == 2203)
 797                         pmc_cntrl[1][i].evnts = ctr[i].event;
 798                 else if ((ctr[i].event >= 2200) && (ctr[i].event <= 2215))
 799                         pmc_cntrl[1][i].evnts = ctr[i].event + 16;
 800                 else
 801                         pmc_cntrl[1][i].evnts = ctr[i].event;
 802
 803                 pmc_cntrl[1][i].masks = ctr[i].unit_mask;
 804                 pmc_cntrl[1][i].enabled = ctr[i].enabled;
 805                 pmc_cntrl[1][i].vcntr = i;
 806         }
 807
 808         for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
 809                 input_bus[i] = 0xff;
 810
 811         /*
 812          * Our counters count up, and "count" refers to
 813          * how much before the next interrupt, and we interrupt
 814          * on overflow.  So we calculate the starting value
 815          * which will give us "count" until overflow.
 816          * Then we set the events on the enabled counters.
 817          */
 818         for (i = 0; i < num_counters; ++i) {
 819                 /* start with virtual counter set 0 */
 820                 if (pmc_cntrl[0][i].enabled) {
 821                         /* Using 32bit counters, reset max - count */
 822                         reset_value[i] = 0xFFFFFFFF - ctr[i].count;
 823                         set_pm_event(i,
 824                                      pmc_cntrl[0][i].evnts,
 825                                      pmc_cntrl[0][i].masks);
 826
 827                         /* global, used by cell_cpu_setup */
 828                         ctr_enabled |= (1 << i);
 829                 }
 830         }
 831
 832         /* initialize the previous counts for the virtual cntrs */
 833         for_each_online_cpu(cpu)
 834                 for (i = 0; i < num_counters; ++i) {
 835                         per_cpu(pmc_values, cpu)[i] = reset_value[i];
 836                 }
 837
 838         return 0;
 839 }
 840
 841
 842 /* This function is called once for all cpus combined */
 843 static int cell_reg_setup(struct op_counter_config *ctr,
 844                         struct op_system_config *sys, int num_ctrs)
 845 {
 846         int ret=0;
 847         spu_cycle_reset = 0;
 848
 849         /* initialize the spu_arr_trace value, will be reset if
 850          * doing spu event profiling.
 851          */
 852         pm_regs.group_control = 0;
 853         pm_regs.debug_bus_control = 0;
 854         pm_regs.pm_cntrl.stop_at_max = 1;
 855         pm_regs.pm_cntrl.trace_mode = 0;
 856         pm_regs.pm_cntrl.freeze = 1;
 857         pm_regs.pm_cntrl.trace_buf_ovflw = 0;
 858         pm_regs.pm_cntrl.spu_addr_trace = 0;
 859
 860         /*
 861          * For all events except PPU CYCLEs, each node will need to make
 862          * the rtas cbe-perftools call to setup and reset the debug bus.
 863          * Make the token lookup call once and store it in the global
 864          * variable pm_rtas_token.
 865          */
 866         pm_rtas_token = rtas_token("ibm,cbe-perftools");
 867
 868         if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
 869                 printk(KERN_ERR
 870                        "%s: rtas token ibm,cbe-perftools unknown\n",
 871                        __func__);
 872                 return -EIO;
 873         }
 874
 875         if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
 876                 profiling_mode = SPU_PROFILING_CYCLES;
 877                 ret = cell_reg_setup_spu_cycles(ctr, sys, num_ctrs);
 878         } else if ((ctr[0].event >= SPU_EVENT_NUM_START) &&
 879                    (ctr[0].event <= SPU_EVENT_NUM_STOP)) {
 880                 profiling_mode = SPU_PROFILING_EVENTS;
 881                 spu_cycle_reset = ctr[0].count;
 882
 883                 /* for SPU event profiling, need to setup the
 884                  * pm_signal array with the events to route the
 885                  * SPU PC before making the FW call.  Note, only
 886                  * one SPU event for profiling can be specified
 887                  * at a time.
 888                  */
 889                 cell_reg_setup_spu_events(ctr, sys, num_ctrs);
 890         } else {
 891                 profiling_mode = PPU_PROFILING;
 892                 ret = cell_reg_setup_ppu(ctr, sys, num_ctrs);
 893         }
 894
 895         return ret;
 896 }
 897
 898
 899
 900 /* This function is called once for each cpu */
 901 static int cell_cpu_setup(struct op_counter_config *cntr)
 902 {
 903         u32 cpu = smp_processor_id();
 904         u32 num_enabled = 0;
 905         int i;
 906         int ret;
 907
 908         /* Cycle based SPU profiling does not use the performance
 909          * counters.  The trace array is configured to collect
 910          * the data.
 911          */
 912         if (profiling_mode == SPU_PROFILING_CYCLES)
 913                 return 0;
 914
 915         /* There is one performance monitor per processor chip (i.e. node),
 916          * so we only need to perform this function once per node.
 917          */
 918         if (cbe_get_hw_thread_id(cpu))
 919                 return 0;
 920
 921         /* Stop all counters */
 922         cbe_disable_pm(cpu);
 923         cbe_disable_pm_interrupts(cpu);
 924
 925         cbe_write_pm(cpu, pm_start_stop, 0);
 926         cbe_write_pm(cpu, group_control, pm_regs.group_control);
 927         cbe_write_pm(cpu, debug_bus_control, pm_regs.debug_bus_control);
 928         write_pm_cntrl(cpu);
 929
 930         for (i = 0; i < num_counters; ++i) {
 931                 if (ctr_enabled & (1 << i)) {
 932                         pm_signal[num_enabled].cpu = cbe_cpu_to_node(cpu);
 933                         num_enabled++;
 934                 }
 935         }
 936
 937         /*
 938          * The pm_rtas_activate_signals will return -EIO if the FW
 939          * call failed.
 940          */
 941         if (profiling_mode == SPU_PROFILING_EVENTS) {
 942                 /* For SPU event profiling also need to setup the
 943                  * pm interval timer
 944                  */
 945                 ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu),
 946                                                num_enabled+2);
 947                 /* store PC from debug bus to Trace buffer as often
 948                  * as possible (every 10 cycles)
 949                  */
 950                 cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
 951                 return ret;
 952         } else
 953                 return pm_rtas_activate_signals(cbe_cpu_to_node(cpu),
 954                                                 num_enabled);
 955 }
 956
 957 #define ENTRIES  303
 958 #define MAXLFSR  0xFFFFFF
 959
 960 /* precomputed table of 24 bit LFSR values */
 961 static int initial_lfsr[] = {
 962  8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
 963  15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
 964  4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
 965  3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
 966  9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026,
 967  2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556,
 968  3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769,
 969  14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893,
 970  11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017,
 971  6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756,
 972  15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558,
 973  7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401,
 974  16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720,
 975  15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042,
 976  15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955,
 977  10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934,
 978  3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783,
 979  3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278,
 980  8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051,
 981  8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741,
 982  4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972,
 983  16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302,
 984  2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384,
 985  14674502, 10279218, 8536112, 10364279, 6877778, 14051163, 1025130, 6072469,
 986  1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697,
 987  6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398,
 988  10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140,
 989  10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214,
 990  14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386,
 991  7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087,
 992  9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130,
 993  14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300,
 994  13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475,
 995  5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950,
 996  3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
 997  6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
 998  7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
 999  6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
1000 };
1001
1002 /*
1003  * The hardware uses an LFSR counting sequence to determine when to capture
 * the SPU PCs.  An LFSR sequence is like a pseudo random number sequence
1005  * where each number occurs once in the sequence but the sequence is not in
1006  * numerical order. The SPU PC capture is done when the LFSR sequence reaches
1007  * the last value in the sequence.  Hence the user specified value N
1008  * corresponds to the LFSR number that is N from the end of the sequence.
1009  *
1010  * To avoid the time to compute the LFSR, a lookup table is used.  The 24 bit
1011  * LFSR sequence is broken into four ranges.  The spacing of the precomputed
 * values is adjusted in each range so the error between the user specified
 * number (N) of events between samples and the actual number of events based
 * on the precomputed value will be less than about 6.2%.  Note, if the user
1015  * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used.
1016  * This is to prevent the loss of samples because the trace buffer is full.
1017  *
1018  *         User specified N                  Step between          Index in
1019  *                                       precomputed values      precomputed
1020  *                                                                  table
1021  * 0                to  2^16-1                  ----                  0
1022  * 2^16     to  2^16+2^19-1             2^12                1 to 128
1023  * 2^16+2^19        to  2^16+2^19+2^22-1        2^15              129 to 256
1024  * 2^16+2^19+2^22  to   2^24-1                  2^18              257 to 302
1025  *
1026  *
1027  * For example, the LFSR values in the second range are computed for 2^16,
 * 2^16+2^12, ... , 2^19-2^16, 2^19 and stored in the table at indices
1029  * 1, 2,..., 127, 128.
1030  *
1031  * The 24 bit LFSR value for the nth number in the sequence can be
1032  * calculated using the following code:
1033  *
1034  * #define size 24
1035  * int calculate_lfsr(int n)
1036  * {
1037  *      int i;
1038  *      unsigned int newlfsr0;
1039  *      unsigned int lfsr = 0xFFFFFF;
1040  *      unsigned int howmany = n;
1041  *
1042  *      for (i = 2; i < howmany + 2; i++) {
1043  *              newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
1044  *              ((lfsr >> (size - 1 - 1)) & 1) ^
1045  *              (((lfsr >> (size - 1 - 6)) & 1) ^
1046  *              ((lfsr >> (size - 1 - 23)) & 1)));
1047  *
1048  *              lfsr >>= 1;
1049  *              lfsr = lfsr | (newlfsr0 << (size - 1));
1050  *      }
1051  *      return lfsr;
1052  * }
1053  */
1054
1055 #define V2_16  (0x1 << 16)
1056 #define V2_19  (0x1 << 19)
1057 #define V2_22  (0x1 << 22)
1058
1059 static int calculate_lfsr(int n)
1060 {
1061         /*
1062          * The ranges and steps are in powers of 2 so the calculations
1063          * can be done using shifts rather then divide.
1064          */
1065         int index;
1066
1067         if ((n >> 16) == 0)
1068                 index = 0;
1069         else if (((n - V2_16) >> 19) == 0)
1070                 index = ((n - V2_16) >> 12) + 1;
1071         else if (((n - V2_16 - V2_19) >> 22) == 0)
1072                 index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
1073         else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0)
1074                 index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
1075         else
1076                 index = ENTRIES-1;
1077
1078         /* make sure index is valid */
1079         if ((index >= ENTRIES) || (index < 0))
1080                 index = ENTRIES-1;
1081
1082         return initial_lfsr[index];
1083 }
1084
1085 static int pm_rtas_activate_spu_profiling(u32 node)
1086 {
1087         int ret, i;
1088         struct pm_signal pm_signal_local[NUM_SPUS_PER_NODE];
1089
1090         /*
1091          * Set up the rtas call to configure the debug bus to
1092          * route the SPU PCs.  Setup the pm_signal for each SPU
1093          */
1094         for (i = 0; i < ARRAY_SIZE(pm_signal_local); i++) {
1095                 pm_signal_local[i].cpu = node;
1096                 pm_signal_local[i].signal_group = 41;
1097                 /* spu i on word (i/2) */
1098                 pm_signal_local[i].bus_word = 1 << i / 2;
1099                 /* spu i */
1100                 pm_signal_local[i].sub_unit = i;
1101                 pm_signal_local[i].bit = 63;
1102         }
1103
1104         ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE,
1105                                      PASSTHRU_ENABLE, pm_signal_local,
1106                                      (ARRAY_SIZE(pm_signal_local)
1107                                       * sizeof(struct pm_signal)));
1108
1109         if (unlikely(ret)) {
1110                 printk(KERN_WARNING "%s: rtas returned: %d\n",
1111                        __func__, ret);
1112                 return -EIO;
1113         }
1114
1115         return 0;
1116 }
1117
#ifdef CONFIG_CPU_FREQ
/*
 * cpufreq transition notifier callback.
 *
 * Passes the new frequency to set_spu_profiling_frequency() when:
 *  - a change to a higher frequency is about to happen (PRECHANGE),
 *  - a change to a lower frequency has just happened (POSTCHANGE), or
 *  - the event is a RESUMECHANGE or SUSPENDCHANGE.
 *
 * @nb:   notifier block (unused)
 * @val:  cpufreq event code
 * @data: points to the struct cpufreq_freqs describing the transition
 *
 * Always returns 0.
 */
static int
oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
{
        struct cpufreq_freqs *frq = data;
        int going_up_soon;
        int went_down;
        int suspend_or_resume;

        going_up_soon = (val == CPUFREQ_PRECHANGE) && (frq->old < frq->new);
        went_down = (val == CPUFREQ_POSTCHANGE) && (frq->old > frq->new);
        suspend_or_resume = (val == CPUFREQ_RESUMECHANGE) ||
                            (val == CPUFREQ_SUSPENDCHANGE);

        if (going_up_soon || went_down || suspend_or_resume)
                set_spu_profiling_frequency(frq->new, spu_cycle_reset);

        return 0;
}

/* Notifier registered by the SPU cycle profiling start routine */
static struct notifier_block cpu_freq_notifier_block = {
        .notifier_call  = oprof_cpufreq_notify
};
#endif
1135
1136 /*
1137  * Note the generic OProfile stop calls do not support returning
1138  * an error on stop.  Hence, will not return an error if the FW
1139  * calls fail on stop.  Failure to reset the debug bus is not an issue.
1140  * Failure to disable the SPU profiling is not an issue.  The FW calls
1141  * to enable the performance counters and debug bus will work even if
1142  * the hardware was not cleanly reset.
1143  */
1144 static void cell_global_stop_spu_cycles(void)
1145 {
1146         int subfunc, rtn_value;
1147         unsigned int lfsr_value;
1148         int cpu;
1149
1150         oprofile_running = 0;
1151         smp_wmb();
1152
1153 #ifdef CONFIG_CPU_FREQ
1154         cpufreq_unregister_notifier(&cpu_freq_notifier_block,
1155                                     CPUFREQ_TRANSITION_NOTIFIER);
1156 #endif
1157
1158         for_each_online_cpu(cpu) {
1159                 if (cbe_get_hw_thread_id(cpu))
1160                         continue;
1161
1162                 subfunc = 3;    /*
1163                                  * 2 - activate SPU tracing,
1164                                  * 3 - deactivate
1165                                  */
1166                 lfsr_value = 0x8f100000;
1167
1168                 rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL,
1169                                       subfunc, cbe_cpu_to_node(cpu),
1170                                       lfsr_value);
1171
1172                 if (unlikely(rtn_value != 0)) {
1173                         printk(KERN_ERR
1174                                "%s: rtas call ibm,cbe-spu-perftools " \
1175                                "failed, return = %d\n",
1176                                __func__, rtn_value);
1177                 }
1178
1179                 /* Deactivate the signals */
1180                 pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1181         }
1182
1183         stop_spu_profiling_cycles();
1184 }
1185
1186 static void cell_global_stop_spu_events(void)
1187 {
1188         int cpu;
1189         oprofile_running = 0;
1190
1191         stop_spu_profiling_events();
1192         smp_wmb();
1193
1194         for_each_online_cpu(cpu) {
1195                 if (cbe_get_hw_thread_id(cpu))
1196                         continue;
1197
1198                 cbe_sync_irq(cbe_cpu_to_node(cpu));
1199                 /* Stop the counters */
1200                 cbe_disable_pm(cpu);
1201                 cbe_write_pm07_control(cpu, 0, 0);
1202
1203                 /* Deactivate the signals */
1204                 pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1205
1206                 /* Deactivate interrupts */
1207                 cbe_disable_pm_interrupts(cpu);
1208         }
1209         del_timer_sync(&timer_spu_event_swap);
1210 }
1211
1212 static void cell_global_stop_ppu(void)
1213 {
1214         int cpu;
1215
1216         /*
1217          * This routine will be called once for the system.
1218          * There is one performance monitor per node, so we
1219          * only need to perform this function once per node.
1220          */
1221         del_timer_sync(&timer_virt_cntr);
1222         oprofile_running = 0;
1223         smp_wmb();
1224
1225         for_each_online_cpu(cpu) {
1226                 if (cbe_get_hw_thread_id(cpu))
1227                         continue;
1228
1229                 cbe_sync_irq(cbe_cpu_to_node(cpu));
1230                 /* Stop the counters */
1231                 cbe_disable_pm(cpu);
1232
1233                 /* Deactivate the signals */
1234                 pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1235
1236                 /* Deactivate interrupts */
1237                 cbe_disable_pm_interrupts(cpu);
1238         }
1239 }
1240
1241 static void cell_global_stop(void)
1242 {
1243         if (profiling_mode == PPU_PROFILING)
1244                 cell_global_stop_ppu();
1245         else if (profiling_mode == SPU_PROFILING_EVENTS)
1246                 cell_global_stop_spu_events();
1247         else
1248                 cell_global_stop_spu_cycles();
1249 }
1250
1251 static int cell_global_start_spu_cycles(struct op_counter_config *ctr)
1252 {
1253         int subfunc;
1254         unsigned int lfsr_value;
1255         int cpu;
1256         int ret;
1257         int rtas_error;
1258         unsigned int cpu_khzfreq = 0;
1259
1260         /* The SPU profiling uses time-based profiling based on
1261          * cpu frequency, so if configured with the CPU_FREQ
1262          * option, we should detect frequency changes and react
1263          * accordingly.
1264          */
1265 #ifdef CONFIG_CPU_FREQ
1266         ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
1267                                         CPUFREQ_TRANSITION_NOTIFIER);
1268         if (ret < 0)
1269                 /* this is not a fatal error */
1270                 printk(KERN_ERR "CPU freq change registration failed: %d\n",
1271                        ret);
1272
1273         else
1274                 cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
1275 #endif
1276
1277         set_spu_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
1278
1279         for_each_online_cpu(cpu) {
1280                 if (cbe_get_hw_thread_id(cpu))
1281                         continue;
1282
1283                 /*
1284                  * Setup SPU cycle-based profiling.
1285                  * Set perf_mon_control bit 0 to a zero before
1286                  * enabling spu collection hardware.
1287                  */
1288                 cbe_write_pm(cpu, pm_control, 0);
1289
1290                 if (spu_cycle_reset > MAX_SPU_COUNT)
1291                         /* use largest possible value */
1292                         lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
1293                 else
1294                         lfsr_value = calculate_lfsr(spu_cycle_reset);
1295
1296                 /* must use a non zero value. Zero disables data collection. */
1297                 if (lfsr_value == 0)
1298                         lfsr_value = calculate_lfsr(1);
1299
1300                 lfsr_value = lfsr_value << 8; /* shift lfsr to correct
1301                                                 * register location
1302                                                 */
1303
1304                 /* debug bus setup */
1305                 ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
1306
1307                 if (unlikely(ret)) {
1308                         rtas_error = ret;
1309                         goto out;
1310                 }
1311
1312
1313                 subfunc = 2;    /* 2 - activate SPU tracing, 3 - deactivate */
1314
1315                 /* start profiling */
1316                 ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
1317                                 cbe_cpu_to_node(cpu), lfsr_value);
1318
1319                 if (unlikely(ret != 0)) {
1320                         printk(KERN_ERR
1321                                "%s: rtas call ibm,cbe-spu-perftools failed, " \
1322                                "return = %d\n", __func__, ret);
1323                         rtas_error = -EIO;
1324                         goto out;
1325                 }
1326         }
1327
1328         rtas_error = start_spu_profiling_cycles(spu_cycle_reset);
1329         if (rtas_error)
1330                 goto out_stop;
1331
1332         oprofile_running = 1;
1333         return 0;
1334
1335 out_stop:
1336         cell_global_stop_spu_cycles();  /* clean up the PMU/debug bus */
1337 out:
1338         return rtas_error;
1339 }
1340
1341 static int cell_global_start_spu_events(struct op_counter_config *ctr)
1342 {
1343         int cpu;
1344         u32 interrupt_mask = 0;
1345         int rtn = 0;
1346
1347         hdw_thread = 0;
1348
1349         /* spu event profiling, uses the performance counters to generate
1350          * an interrupt.  The hardware is setup to store the SPU program
1351          * counter into the trace array.  The occurrence mode is used to
1352          * enable storing data to the trace buffer.  The bits are set
1353          * to send/store the SPU address in the trace buffer.  The debug
1354          * bus must be setup to route the SPU program counter onto the
1355          * debug bus.  The occurrence data in the trace buffer is not used.
1356          */
1357
1358         /* This routine gets called once for the system.
1359          * There is one performance monitor per node, so we
1360          * only need to perform this function once per node.
1361          */
1362
1363         for_each_online_cpu(cpu) {
1364                 if (cbe_get_hw_thread_id(cpu))
1365                         continue;
1366
1367                 /*
1368                  * Setup SPU event-based profiling.
1369                  * Set perf_mon_control bit 0 to a zero before
1370                  * enabling spu collection hardware.
1371                  *
1372                  * Only support one SPU event on one SPU per node.
1373                  */
1374                 if (ctr_enabled & 1) {
1375                         cbe_write_ctr(cpu, 0, reset_value[0]);
1376                         enable_ctr(cpu, 0, pm_regs.pm07_cntrl);
1377                         interrupt_mask |=
1378                                 CBE_PM_CTR_OVERFLOW_INTR(0);
1379                 } else {
1380                         /* Disable counter */
1381                         cbe_write_pm07_control(cpu, 0, 0);
1382                 }
1383
1384                 cbe_get_and_clear_pm_interrupts(cpu);
1385                 cbe_enable_pm_interrupts(cpu, hdw_thread, interrupt_mask);
1386                 cbe_enable_pm(cpu);
1387
1388                 /* clear the trace buffer */
1389                 cbe_write_pm(cpu, trace_address, 0);
1390         }
1391
1392         /* Start the timer to time slice collecting the event profile
1393          * on each of the SPUs.  Note, can collect profile on one SPU
1394          * per node at a time.
1395          */
1396         start_spu_event_swap();
1397         start_spu_profiling_events();
1398         oprofile_running = 1;
1399         smp_wmb();
1400
1401         return rtn;
1402 }
1403
1404 static int cell_global_start_ppu(struct op_counter_config *ctr)
1405 {
1406         u32 cpu, i;
1407         u32 interrupt_mask = 0;
1408
1409         /* This routine gets called once for the system.
1410          * There is one performance monitor per node, so we
1411          * only need to perform this function once per node.
1412          */
1413         for_each_online_cpu(cpu) {
1414                 if (cbe_get_hw_thread_id(cpu))
1415                         continue;
1416
1417                 interrupt_mask = 0;
1418
1419                 for (i = 0; i < num_counters; ++i) {
1420                         if (ctr_enabled & (1 << i)) {
1421                                 cbe_write_ctr(cpu, i, reset_value[i]);
1422                                 enable_ctr(cpu, i, pm_regs.pm07_cntrl);
1423                                 interrupt_mask |= CBE_PM_CTR_OVERFLOW_INTR(i);
1424                         } else {
1425                                 /* Disable counter */
1426                                 cbe_write_pm07_control(cpu, i, 0);
1427                         }
1428                 }
1429
1430                 cbe_get_and_clear_pm_interrupts(cpu);
1431                 cbe_enable_pm_interrupts(cpu, hdw_thread, interrupt_mask);
1432                 cbe_enable_pm(cpu);
1433         }
1434
1435         virt_cntr_inter_mask = interrupt_mask;
1436         oprofile_running = 1;
1437         smp_wmb();
1438
1439         /*
1440          * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
1441          * executed which manipulates the PMU.  We start the "virtual counter"
1442          * here so that we do not need to synchronize access to the PMU in
1443          * the above for-loop.
1444          */
1445         start_virt_cntrs();
1446
1447         return 0;
1448 }
1449
1450 static int cell_global_start(struct op_counter_config *ctr)
1451 {
1452         if (profiling_mode == SPU_PROFILING_CYCLES)
1453                 return cell_global_start_spu_cycles(ctr);
1454         else if (profiling_mode == SPU_PROFILING_EVENTS)
1455                 return cell_global_start_spu_events(ctr);
1456         else
1457                 return cell_global_start_ppu(ctr);
1458 }
1459
1460
1461 /* The SPU interrupt handler
1462  *
1463  * SPU event profiling works as follows:
1464  * The pm_signal[0] holds the one SPU event to be measured.  It is routed on
1465  * the debug bus using word 0 or 1.  The value of pm_signal[1] and
1466  * pm_signal[2] contain the necessary events to route the SPU program
1467  * counter for the selected SPU onto the debug bus using words 2 and 3.
1468  * The pm_interval register is setup to write the SPU PC value into the
1469  * trace buffer at the maximum rate possible.  The trace buffer is configured
1470  * to store the PCs, wrapping when it is full.  The performance counter is
1471  * initialized to the max hardware count minus the number of events, N, between
1472  * samples.  Once the N events have occurred, a HW counter overflow occurs
1473  * causing the generation of a HW counter interrupt which also stops the
1474  * writing of the SPU PC values to the trace buffer.  Hence the last PC
1475  * written to the trace buffer is the SPU PC that we want.  Unfortunately,
1476  * we have to read from the beginning of the trace buffer to get to the
 * last value written.  We just hope the PPU has nothing better to do than
1478  * service this interrupt. The PC for the specific SPU being profiled is
1479  * extracted from the trace buffer processed and stored.  The trace buffer
1480  * is cleared, interrupts are cleared, the counter is reset to max - N.
1481  * A kernel timer is used to periodically call the routine spu_evnt_swap()
 * to switch to the next physical SPU in the node to profile in round robin
 * order.  This way data is collected for all SPUs on the node. It does mean
 * that we need to use a relatively small value of N to ensure enough samples
 * on each SPU are collected, since each SPU is being profiled 1/8 of the time.
1486  * It may also be necessary to use a longer sample collection period.
1487  */
1488 static void cell_handle_interrupt_spu(struct pt_regs *regs,
1489                                       struct op_counter_config *ctr)
1490 {
1491         u32 cpu, cpu_tmp;
1492         u64 trace_entry;
1493         u32 interrupt_mask;
1494         u64 trace_buffer[2];
1495         u64 last_trace_buffer;
1496         u32 sample;
1497         u32 trace_addr;
1498         unsigned long sample_array_lock_flags;
1499         int spu_num;
1500         unsigned long flags;
1501
1502         /* Make sure spu event interrupt handler and spu event swap
1503          * don't access the counters simultaneously.
1504          */
1505         cpu = smp_processor_id();
1506         spin_lock_irqsave(&cntr_lock, flags);
1507
1508         cpu_tmp = cpu;
1509         cbe_disable_pm(cpu);
1510
1511         interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
1512
1513         sample = 0xABCDEF;
1514         trace_entry = 0xfedcba;
1515         last_trace_buffer = 0xdeadbeaf;
1516
1517         if ((oprofile_running == 1) && (interrupt_mask != 0)) {
1518                 /* disable writes to trace buff */
1519                 cbe_write_pm(cpu, pm_interval, 0);
1520
1521                 /* only have one perf cntr being used, cntr 0 */
1522                 if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(0))
1523                     && ctr[0].enabled)
1524                         /* The SPU PC values will be read
1525                          * from the trace buffer, reset counter
1526                          */
1527
1528                         cbe_write_ctr(cpu, 0, reset_value[0]);
1529
1530                 trace_addr = cbe_read_pm(cpu, trace_address);
1531
1532                 while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) {
1533                         /* There is data in the trace buffer to process
1534                          * Read the buffer until you get to the last
1535                          * entry.  This is the value we want.
1536                          */
1537
1538                         cbe_read_trace_buffer(cpu, trace_buffer);
1539                         trace_addr = cbe_read_pm(cpu, trace_address);
1540                 }
1541
1542                 /* SPU Address 16 bit count format for 128 bit
1543                  * HW trace buffer is used for the SPU PC storage
1544                  *    HDR bits          0:15
1545                  *    SPU Addr 0 bits   16:31
1546                  *    SPU Addr 1 bits   32:47
1547                  *    unused bits       48:127
1548                  *
1549                  * HDR: bit4 = 1 SPU Address 0 valid
1550                  * HDR: bit5 = 1 SPU Address 1 valid
1551                  *  - unfortunately, the valid bits don't seem to work
1552                  *
1553                  * Note trace_buffer[0] holds bits 0:63 of the HW
1554                  * trace buffer, trace_buffer[1] holds bits 64:127
1555                  */
1556
1557                 trace_entry = trace_buffer[0]
1558                         & 0x00000000FFFF0000;
1559
1560                 /* only top 16 of the 18 bit SPU PC address
1561                  * is stored in trace buffer, hence shift right
1562                  * by 16 -2 bits */
1563                 sample = trace_entry >> 14;
1564                 last_trace_buffer = trace_buffer[0];
1565
1566                 spu_num = spu_evnt_phys_spu_indx
1567                         + (cbe_cpu_to_node(cpu) * NUM_SPUS_PER_NODE);
1568
1569                 /* make sure only one process at a time is calling
1570                  * spu_sync_buffer()
1571                  */
1572                 spin_lock_irqsave(&oprof_spu_smpl_arry_lck,
1573                                   sample_array_lock_flags);
1574                 spu_sync_buffer(spu_num, &sample, 1);
1575                 spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck,
1576                                        sample_array_lock_flags);
1577
1578                 smp_wmb();    /* insure spu event buffer updates are written
1579                                * don't want events intermingled... */
1580
1581                 /* The counters were frozen by the interrupt.
1582                  * Reenable the interrupt and restart the counters.
1583                  */
1584                 cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
1585                 cbe_enable_pm_interrupts(cpu, hdw_thread,
1586                                          virt_cntr_inter_mask);
1587
1588                 /* clear the trace buffer, re-enable writes to trace buff */
1589                 cbe_write_pm(cpu, trace_address, 0);
1590                 cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
1591
1592                 /* The writes to the various performance counters only writes
1593                  * to a latch.  The new values (interrupt setting bits, reset
1594                  * counter value etc.) are not copied to the actual registers
1595                  * until the performance monitor is enabled.  In order to get
1596                  * this to work as desired, the performance monitor needs to
1597                  * be disabled while writing to the latches.  This is a
1598                  * HW design issue.
1599                  */
1600                 write_pm_cntrl(cpu);
1601                 cbe_enable_pm(cpu);
1602         }
1603         spin_unlock_irqrestore(&cntr_lock, flags);
1604 }
1605
1606 static void cell_handle_interrupt_ppu(struct pt_regs *regs,
1607                                       struct op_counter_config *ctr)
1608 {
1609         u32 cpu;
1610         u64 pc;
1611         int is_kernel;
1612         unsigned long flags = 0;
1613         u32 interrupt_mask;
1614         int i;
1615
1616         cpu = smp_processor_id();
1617
1618         /*
1619          * Need to make sure the interrupt handler and the virt counter
1620          * routine are not running at the same time. See the
1621          * cell_virtual_cntr() routine for additional comments.
1622          */
1623         spin_lock_irqsave(&cntr_lock, flags);
1624
1625         /*
1626          * Need to disable and reenable the performance counters
1627          * to get the desired behavior from the hardware.  This
1628          * is hardware specific.
1629          */
1630
1631         cbe_disable_pm(cpu);
1632
1633         interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
1634
1635         /*
1636          * If the interrupt mask has been cleared, then the virt cntr
1637          * has cleared the interrupt.  When the thread that generated
1638          * the interrupt is restored, the data count will be restored to
1639          * 0xffffff0 to cause the interrupt to be regenerated.
1640          */
1641
1642         if ((oprofile_running == 1) && (interrupt_mask != 0)) {
1643                 pc = regs->nip;
1644                 is_kernel = is_kernel_addr(pc);
1645
1646                 for (i = 0; i < num_counters; ++i) {
1647                         if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(i))
1648                             && ctr[i].enabled) {
1649                                 oprofile_add_ext_sample(pc, regs, i, is_kernel);
1650                                 cbe_write_ctr(cpu, i, reset_value[i]);
1651                         }
1652                 }
1653
1654                 /*
1655                  * The counters were frozen by the interrupt.
1656                  * Reenable the interrupt and restart the counters.
1657                  * If there was a race between the interrupt handler and
1658                  * the virtual counter routine.  The virtual counter
1659                  * routine may have cleared the interrupts.  Hence must
1660                  * use the virt_cntr_inter_mask to re-enable the interrupts.
1661                  */
1662                 cbe_enable_pm_interrupts(cpu, hdw_thread,
1663                                          virt_cntr_inter_mask);
1664
1665                 /*
1666                  * The writes to the various performance counters only writes
1667                  * to a latch.  The new values (interrupt setting bits, reset
1668                  * counter value etc.) are not copied to the actual registers
1669                  * until the performance monitor is enabled.  In order to get
1670                  * this to work as desired, the performance monitor needs to
1671                  * be disabled while writing to the latches.  This is a
1672                  * HW design issue.
1673                  */
1674                 cbe_enable_pm(cpu);
1675         }
1676         spin_unlock_irqrestore(&cntr_lock, flags);
1677 }
1678
1679 static void cell_handle_interrupt(struct pt_regs *regs,
1680                                   struct op_counter_config *ctr)
1681 {
1682         if (profiling_mode == PPU_PROFILING)
1683                 cell_handle_interrupt_ppu(regs, ctr);
1684         else
1685                 cell_handle_interrupt_spu(regs, ctr);
1686 }
1687
1688 /*
1689  * This function is called from the generic OProfile
1690  * driver.  When profiling PPUs, we need to do the
1691  * generic sync start; otherwise, do spu_sync_start.
1692  */
1693 static int cell_sync_start(void)
1694 {
1695         if ((profiling_mode == SPU_PROFILING_CYCLES) ||
1696             (profiling_mode == SPU_PROFILING_EVENTS))
1697                 return spu_sync_start();
1698         else
1699                 return DO_GENERIC_SYNC;
1700 }
1701
1702 static int cell_sync_stop(void)
1703 {
1704         if ((profiling_mode == SPU_PROFILING_CYCLES) ||
1705             (profiling_mode == SPU_PROFILING_EVENTS))
1706                 return spu_sync_stop();
1707         else
1708                 return 1;
1709 }
1710
1711 struct op_powerpc_model op_model_cell = {
1712         .reg_setup = cell_reg_setup,
1713         .cpu_setup = cell_cpu_setup,
1714         .global_start = cell_global_start,
1715         .global_stop = cell_global_stop,
1716         .sync_start = cell_sync_start,
1717         .sync_stop = cell_sync_stop,
1718         .handle_interrupt = cell_handle_interrupt,
1719 };