/* smp.c: Sparc64 SMP support.
 *
 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/threads.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/spinlock.h>

#include <asm/head.h>
#include <asm/ptrace.h>
#include <asm/atomic.h>

#include <asm/irq.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/oplib.h>
#include <asm/hardirq.h>
#include <asm/softirq.h>
#include <asm/uaccess.h>
#include <asm/timer.h>
#include <asm/starfire.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
extern int linux_num_cpus;
extern void calibrate_delay(void);
extern unsigned prom_cpu_nodes[];

struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64)));

volatile int __cpu_number_map[NR_CPUS] __attribute__ ((aligned (64)));
volatile int __cpu_logical_map[NR_CPUS] __attribute__ ((aligned (64)));

/* Please don't make this stuff initdata!!!  --DaveM */
static unsigned char boot_cpu_id = 0;
static int smp_activated = 0;

/* Kernel spinlock */
spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;

volatile int smp_processors_ready = 0;
unsigned long cpu_present_map = 0;
int smp_num_cpus = 1;
int smp_threads_ready = 0;

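/* A note on the representation used below: cpu_present_map is a plain
 * 64-bit mask, bit N meaning "a cpu with mid N exists", so a two-cpu box
 * with mids 0 and 2 would, for example, have cpu_present_map == 0x5.
 * That is why everything in this file tests (cpu_present_map & (1UL << i))
 * and why this scheme tops out at 64 cpus.
 */
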
void __init smp_setup(char *str, int *ints)
{
        /* XXX implement me XXX */
}

int smp_info(char *buf)
{
        int len = 7, i;

        strcpy(buf, "State:\n");
        for (i = 0; i < NR_CPUS; i++)
                if (cpu_present_map & (1UL << i))
                        len += sprintf(buf + len,
                                       "CPU%d:\t\tonline\n", i);
        return len;
}

int smp_bogo(char *buf)
{
        int len = 0, i;

        for (i = 0; i < NR_CPUS; i++)
                if (cpu_present_map & (1UL << i))
                        len += sprintf(buf + len,
                                       "Cpu%dBogo\t: %lu.%02lu\n",
                                       i, cpu_data[i].udelay_val / 500000,
                                       (cpu_data[i].udelay_val / 5000) % 100);
        return len;
}

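/* The formatting arithmetic above turns udelay_val (i.e. loops_per_sec)
 * into the usual "xxx.yy" BogoMIPS string without floating point:
 * value / 500000 is the integer part, (value / 5000) % 100 gives the two
 * fractional digits.  As a worked example, udelay_val == 99532800 would
 * print as "199.06".
 */
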
void __init smp_store_cpu_info(int id)
{
        int i;

        /* multiplier and counter set by
           smp_setup_percpu_timer()  */
        cpu_data[id].udelay_val = loops_per_sec;

        cpu_data[id].pgcache_size = 0;
        cpu_data[id].pte_cache[0] = NULL;
        cpu_data[id].pte_cache[1] = NULL;
        cpu_data[id].pgdcache_size = 0;
        cpu_data[id].pgd_cache = NULL;
        cpu_data[id].idle_volume = 1;

        for (i = 0; i < 16; i++)
                cpu_data[id].irq_worklists[i] = 0;
}

void __init smp_commence(void)
{
}

static void smp_setup_percpu_timer(void);
static void smp_tune_scheduling(void);

static volatile unsigned long callin_flag = 0;

extern void inherit_locked_prom_mappings(int save_p);
extern void cpu_probe(void);

void __init smp_callin(void)
{
        int cpuid = hard_smp_processor_id();
        unsigned long pstate;

        inherit_locked_prom_mappings(0);

        __flush_cache_all();
        __flush_tlb_all();

        cpu_probe();

        /* Guarantee that the following sequences execute
         * uninterrupted.
         */
        __asm__ __volatile__("rdpr %%pstate, %0\n\t"
                             "wrpr %0, %1, %%pstate"
                             : "=r" (pstate)
                             : "i" (PSTATE_IE));

        /* Set things up so user can access tick register for profiling
         * purposes.  Also workaround BB_ERRATA_1 by doing a dummy
         * read back of %tick after writing it.
         */
        __asm__ __volatile__("
        sethi   %%hi(0x80000000), %%g1
        ba,pt   %%xcc, 1f
         sllx   %%g1, 32, %%g1
        .align  64
1:      rd      %%tick, %%g2
        add     %%g2, 6, %%g2
        andn    %%g2, %%g1, %%g2
        wrpr    %%g2, 0, %%tick
        rdpr    %%tick, %%g0"
        : /* no outputs */
        : /* no inputs */
        : "g1", "g2");

        /* Restore PSTATE_IE. */
        __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
                             : /* no outputs */
                             : "r" (pstate));

        smp_setup_percpu_timer();

        __sti();

        calibrate_delay();
        smp_store_cpu_info(cpuid);
        callin_flag = 1;
        __asm__ __volatile__("membar #Sync\n\t"
                             "flush  %%g6" : : : "memory");

        /* Clear this or we will die instantly when we
         * schedule back to this idler...
         */
        current->thread.flags &= ~(SPARC_FLAG_NEWCHILD);

        /* Attach to the address space of init_task. */
        atomic_inc(&init_mm.mm_count);
        current->active_mm = &init_mm;

        while (!smp_processors_ready)
                membar("#LoadLoad");
}

extern int cpu_idle(void);
extern void init_IRQ(void);

void initialize_secondary(void)
{
}

int start_secondary(void *unused)
{
        trap_init();
        init_IRQ();
        smp_callin();
        return cpu_idle();
}

void cpu_panic(void)
{
        printk("CPU[%d]: Returns from cpu_idle!\n", smp_processor_id());
        panic("SMP bolixed\n");
}

extern struct prom_cpuinfo linux_cpus[64];

extern unsigned long sparc64_cpu_startup;

/* The OBP cpu startup callback truncates the 3rd arg cookie to
 * 32-bits (I think) so to be safe we have it read the pointer
 * contained here so we work on >4GB machines. -DaveM
 */
static struct task_struct *cpu_new_task = NULL;

void __init smp_boot_cpus(void)
{
        int cpucount = 0, i;

        printk("Entering UltraSMPenguin Mode...\n");
        __sti();
        smp_store_cpu_info(boot_cpu_id);
        smp_tune_scheduling();
        init_idle();

        if (linux_num_cpus == 1)
                return;

        for (i = 0; i < NR_CPUS; i++) {
                if (i == boot_cpu_id)
                        continue;

                if (cpu_present_map & (1UL << i)) {
                        unsigned long entry = (unsigned long)(&sparc64_cpu_startup);
                        unsigned long cookie = (unsigned long)(&cpu_new_task);
                        struct task_struct *p;
                        int timeout;
                        int no;

                        prom_printf("Starting CPU %d... ", i);
                        kernel_thread(start_secondary, NULL, CLONE_PID);
                        cpucount++;

                        p = init_task.prev_task;
                        init_tasks[cpucount] = p;

                        p->processor = i;
                        p->has_cpu = 1; /* we schedule the first task manually */

                        del_from_runqueue(p);
                        unhash_process(p);

                        callin_flag = 0;
                        for (no = 0; no < linux_num_cpus; no++)
                                if (linux_cpus[no].mid == i)
                                        break;
                        cpu_new_task = p;
                        prom_startcpu(linux_cpus[no].prom_node,
                                      entry, cookie);
                        for (timeout = 0; timeout < 5000000; timeout++) {
                                if (callin_flag)
                                        break;
                                udelay(100);
                        }
                        if (callin_flag) {
                                __cpu_number_map[i] = cpucount;
                                __cpu_logical_map[cpucount] = i;
                                prom_cpu_nodes[i] = linux_cpus[no].prom_node;
                                prom_printf("OK\n");
                        } else {
                                cpucount--;
                                printk("Processor %d is stuck.\n", i);
                                prom_printf("FAILED\n");
                        }
                }
                if (!callin_flag) {
                        cpu_present_map &= ~(1UL << i);
                        __cpu_number_map[i] = -1;
                }
        }
        cpu_new_task = NULL;
        if (cpucount == 0) {
                printk("Error: only one processor found.\n");
                cpu_present_map = (1UL << smp_processor_id());
        } else {
                unsigned long bogosum = 0;

                for (i = 0; i < NR_CPUS; i++) {
                        if (cpu_present_map & (1UL << i))
                                bogosum += cpu_data[i].udelay_val;
                }
                printk("Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
                       cpucount + 1,
                       (bogosum + 2500)/500000,
                       ((bogosum + 2500)/5000)%100);
                smp_activated = 1;
                smp_num_cpus = cpucount + 1;
        }
        smp_processors_ready = 1;
        membar("#StoreStore | #StoreLoad");
}

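/* A rough sketch of the handshake coded above: the master parks the new
 * idle task in cpu_new_task, has the PROM start the cpu at
 * sparc64_cpu_startup (which picks the task pointer up through the cookie),
 * then polls callin_flag, udelay(100) at a time, until the slave's
 * smp_callin() sets it.  The slave in turn spins at the end of smp_callin()
 * until this routine finally raises smp_processors_ready.
 */
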
/* #define XCALL_DEBUG */

static inline void xcall_deliver(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
{
        u64 result, target;
        int stuck, tmp;

        if (this_is_starfire) {
                /* map to real upaid */
                cpu = (((cpu & 0x3c) << 1) |
                        ((cpu & 0x40) >> 4) |
                        (cpu & 0x3));
        }

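        /* For illustration: the remap above shifts bits [5:2] of the
         * logical number up by one, folds bit 6 down into bit 2 and keeps
         * bits [1:0], so e.g. cpu 0x45 becomes
         * (0x04 << 1) | (0x40 >> 4) | 0x1 == 0x0d.
         */
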
        target = (cpu << 14) | 0x70;
#ifdef XCALL_DEBUG
        printk("CPU[%d]: xcall(data[%016lx:%016lx:%016lx],tgt[%016lx])\n",
               smp_processor_id(), data0, data1, data2, target);
#endif
again:
        /* Ok, this is the real Spitfire Errata #54.
         * One must read back from a UDB internal register
         * after writes to the UDB interrupt dispatch, but
         * before the membar Sync for that write.
         * So we use the high UDB control register (ASI 0x7f,
         * ADDR 0x20) for the dummy read. -DaveM
         */
        tmp = 0x40;
        __asm__ __volatile__("
        wrpr    %1, %2, %%pstate
        stxa    %4, [%0] %3
        stxa    %5, [%0+%8] %3
        add     %0, %8, %0
        stxa    %6, [%0+%8] %3
        membar  #Sync
        stxa    %%g0, [%7] %3
        membar  #Sync
        mov     0x20, %%g1
        ldxa    [%%g1] 0x7f, %%g0
        membar  #Sync"
        : "=r" (tmp)
        : "r" (pstate), "i" (PSTATE_IE), "i" (ASI_UDB_INTR_W),
          "r" (data0), "r" (data1), "r" (data2), "r" (target), "r" (0x10), "0" (tmp)
        : "g1");

        /* NOTE: PSTATE_IE is still clear. */
        stuck = 100000;
        do {
                __asm__ __volatile__("ldxa [%%g0] %1, %0"
                        : "=r" (result)
                        : "i" (ASI_INTR_DISPATCH_STAT));
                if (result == 0) {
                        __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
                                             : : "r" (pstate));
                        return;
                }
                stuck -= 1;
                if (stuck == 0)
                        break;
        } while (result & 0x1);
        __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
                             : : "r" (pstate));
        if (stuck == 0) {
#ifdef XCALL_DEBUG
                printk("CPU[%d]: mondo stuckage result[%016lx]\n",
                       smp_processor_id(), result);
#endif
        } else {
#ifdef XCALL_DEBUG
                printk("CPU[%d]: Penguin %d NACK's master.\n", smp_processor_id(), cpu);
#endif
                udelay(2);
                goto again;
        }
}

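/* What the status loop above relies on, roughly: after the stxa to the UDB
 * interrupt dispatch ASI, the dispatch status register reads back with its
 * busy bit (the 0x1 tested in the while) set while the mondo is in flight.
 * A result of zero means the target took the vector; dropping out of the
 * loop with a non-zero, non-busy value is treated as a NACK and the whole
 * dispatch is retried after a short udelay.
 */
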
void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2)
{
        if (smp_processors_ready) {
                unsigned long mask = (cpu_present_map & ~(1UL<<smp_processor_id()));
                u64 pstate, data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff));
                int i, ncpus = smp_num_cpus - 1;

                __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
                for (i = 0; i < NR_CPUS; i++) {
                        if (mask & (1UL << i)) {
                                xcall_deliver(data0, data1, data2, pstate, i);
                                ncpus--;
                        }
                        if (!ncpus) break;
                }
                /* NOTE: Caller runs local copy on master. */
        }
}

struct call_data_struct {
        void (*func) (void *info);
        void *info;
        atomic_t finished;
        int wait;
};

extern unsigned long xcall_call_function;

int smp_call_function(void (*func)(void *info), void *info,
                      int nonatomic, int wait)
{
        struct call_data_struct data;
        int cpus = smp_num_cpus - 1;

        if (!cpus)
                return 0;

        data.func = func;
        data.info = info;
        atomic_set(&data.finished, 0);
        data.wait = wait;

        smp_cross_call(&xcall_call_function,
                       0, (u64) &data, 0);
        if (wait) {
                while (atomic_read(&data.finished) != cpus)
                        barrier();
        }

        return 0;
}

void smp_call_function_client(struct call_data_struct *call_data)
{
        call_data->func(call_data->info);
        if (call_data->wait)
                atomic_inc(&call_data->finished);
}

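/* Sketch of typical use (the handler name is made up): to run something on
 * every other cpu and wait for it,
 *
 *      static void do_foo_local(void *info) { ... }
 *
 *      smp_call_function(do_foo_local, NULL, 0, 1);
 *      do_foo_local(NULL);       (caller runs the local copy itself)
 *
 * Only the other smp_num_cpus - 1 processors are cross-called, which is why
 * the wait loop above compares data.finished against cpus rather than
 * smp_num_cpus, and why the caller invokes the handler locally on its own.
 * On the other cpus the handler runs from the cross-call path via
 * smp_call_function_client().
 */
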
extern unsigned long xcall_flush_tlb_page;
extern unsigned long xcall_flush_tlb_mm;
extern unsigned long xcall_flush_tlb_range;
extern unsigned long xcall_flush_tlb_all;
extern unsigned long xcall_tlbcachesync;
extern unsigned long xcall_flush_cache_all;
extern unsigned long xcall_report_regs;
extern unsigned long xcall_receive_signal;

void smp_receive_signal(int cpu)
{
        if (smp_processors_ready &&
            (cpu_present_map & (1UL<<cpu)) != 0) {
                u64 pstate, data0 = (((u64)&xcall_receive_signal) & 0xffffffff);
                __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
                xcall_deliver(data0, 0, 0, pstate, cpu);
        }
}

void smp_report_regs(void)
{
        smp_cross_call(&xcall_report_regs, 0, 0, 0);
}

void smp_flush_cache_all(void)
{
        smp_cross_call(&xcall_flush_cache_all, 0, 0, 0);
        __flush_cache_all();
}

void smp_flush_tlb_all(void)
{
        smp_cross_call(&xcall_flush_tlb_all, 0, 0, 0);
        __flush_tlb_all();
}

/* We know that the window frames of the user have been flushed
 * to the stack before we get here because all callers of us
 * are flush_tlb_*() routines, and these run after flush_cache_*()
 * which performs the flushw.
 *
 * XXX I diked out the fancy flush avoidance code for the
 * XXX swapping cases for now until the new MM code stabilizes. -DaveM
 *
 * The SMP TLB coherency scheme we use works as follows:
 *
 * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
 *    space has (potentially) executed on, this is the heuristic
 *    we use to avoid doing cross calls.
 *
 * 2) TLB context numbers are shared globally across all processors
 *    in the system, this allows us to play several games to avoid
 *    cross calls.
 *
 *    One invariant is that when a cpu switches to a process, and
 *    that process's tsk->active_mm->cpu_vm_mask does not have the
 *    current cpu's bit set, that tlb context is flushed locally.
 *
 *    If the address space is non-shared (ie. mm->count == 1) we avoid
 *    cross calls when we want to flush the currently running process's
 *    tlb state.  This is done by clearing all cpu bits except the current
 *    processor's in current->active_mm->cpu_vm_mask and performing the
 *    flush locally only.  This will force any subsequent cpus which run
 *    this task to flush the context from the local tlb if the process
 *    migrates to another cpu (again).
 *
 * 3) For shared address spaces (threads) and swapping we bite the
 *    bullet for most cases and perform the cross call.
 *
 *    The performance gain from "optimizing" away the cross call for threads is
 *    questionable (in theory the big win for threads is the massive sharing of
 *    address space state across processors).
 *
 * For the swapping case the locking is difficult to get right, we'd have to
 * enforce strict ordered access to mm->cpu_vm_mask via a spinlock for example.
 * Then again one could argue that when you are swapping, the cost of a cross
 * call won't even show up on the performance radar.  But in any case we do get
 * rid of the cross-call when the task has a dead context or the task has only
 * ever run on the local cpu.
 */
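
/* A concrete example of the scheme: a single-threaded task that has only
 * ever run on cpu 2 reaches one of the flush routines below with
 * mm_users == 1 and mm == current->active_mm, so cpu_vm_mask collapses to
 * (1UL << 2) and only the local TLB is touched.  Should the task later be
 * scheduled on cpu 5, switch_mm() finds cpu 5's bit clear and flushes that
 * context locally there, which is what makes skipping the cross call safe.
 */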
void smp_flush_tlb_mm(struct mm_struct *mm)
{
        if (CTX_VALID(mm->context)) {
                u32 ctx = CTX_HWBITS(mm->context);
                int cpu = smp_processor_id();

                if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1) {
                        /* See smp_flush_tlb_page for info about this. */
                        mm->cpu_vm_mask = (1UL << cpu);
                        goto local_flush_and_out;
                }

                smp_cross_call(&xcall_flush_tlb_mm, ctx, 0, 0);

        local_flush_and_out:
                __flush_tlb_mm(ctx, SECONDARY_CONTEXT);
        }
}

void smp_flush_tlb_range(struct mm_struct *mm, unsigned long start,
                         unsigned long end)
{
        if (CTX_VALID(mm->context)) {
                u32 ctx = CTX_HWBITS(mm->context);
                int cpu = smp_processor_id();

                start &= PAGE_MASK;
                end &= PAGE_MASK;

                if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1) {
                        mm->cpu_vm_mask = (1UL << cpu);
                        goto local_flush_and_out;
                }

                smp_cross_call(&xcall_flush_tlb_range, ctx, start, end);

        local_flush_and_out:
                __flush_tlb_range(ctx, start, SECONDARY_CONTEXT, end, PAGE_SIZE, (end-start));
        }
}

void smp_flush_tlb_page(struct mm_struct *mm, unsigned long page)
{
        if (CTX_VALID(mm->context)) {
                u32 ctx = CTX_HWBITS(mm->context);
                int cpu = smp_processor_id();

                page &= PAGE_MASK;
                if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1) {
                        /* By virtue of being the current address space, and
                         * having the only reference to it, the following operation
                         * is safe.
                         *
                         * It would not be a win to perform the xcall tlb flush in
                         * this case, because even if we switch back to one of the
                         * other processors in cpu_vm_mask it is almost certain that
                         * all TLB entries for this context will be replaced by the
                         * time that happens.
                         */
                        mm->cpu_vm_mask = (1UL << cpu);
                        goto local_flush_and_out;
                } else {
                        /* By virtue of running under the mm->page_table_lock,
                         * and mmu_context.h:switch_mm doing the same, the following
                         * operation is safe.
                         */
                        if (mm->cpu_vm_mask == (1UL << cpu))
                                goto local_flush_and_out;
                }

                /* OK, we have to actually perform the cross call.  Most likely
                 * this is a cloned mm or kswapd is kicking out pages for a task
                 * which has run recently on another cpu.
                 */
                smp_cross_call(&xcall_flush_tlb_page, ctx, page, 0);

        local_flush_and_out:
                __flush_tlb_page(ctx, page, SECONDARY_CONTEXT);
        }
}

/* CPU capture. */
/* #define CAPTURE_DEBUG */
extern unsigned long xcall_capture;

static atomic_t smp_capture_depth = ATOMIC_INIT(0);
static atomic_t smp_capture_registry = ATOMIC_INIT(0);
static unsigned long penguins_are_doing_time = 0;

void smp_capture(void)
{
        if (smp_processors_ready) {
                int result = __atomic_add(1, &smp_capture_depth);

                membar("#StoreStore | #LoadStore");
                if (result == 1) {
                        int ncpus = smp_num_cpus;

#ifdef CAPTURE_DEBUG
                        printk("CPU[%d]: Sending penguins to jail...",
                               smp_processor_id());
#endif
                        penguins_are_doing_time = 1;
                        membar("#StoreStore | #LoadStore");
                        atomic_inc(&smp_capture_registry);
                        smp_cross_call(&xcall_capture, 0, 0, 0);
                        while (atomic_read(&smp_capture_registry) != ncpus)
                                membar("#LoadLoad");
#ifdef CAPTURE_DEBUG
                        printk("done\n");
#endif
                }
        }
}

void smp_release(void)
{
        if (smp_processors_ready) {
                if (atomic_dec_and_test(&smp_capture_depth)) {
#ifdef CAPTURE_DEBUG
                        printk("CPU[%d]: Giving pardon to imprisoned penguins\n",
                               smp_processor_id());
#endif
                        penguins_are_doing_time = 0;
                        membar("#StoreStore | #StoreLoad");
                        atomic_dec(&smp_capture_registry);
                }
        }
}

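/* Sketch of how the pair above is meant to be used: a caller that needs the
 * other processors quiesced (around firmware work, for instance) brackets
 * the critical section with
 *
 *      smp_capture();
 *      ... do the critical bit ...
 *      smp_release();
 *
 * while every other processor waits it out in smp_penguin_jailcell() below.
 * smp_capture_depth is a depth count, so nested capture/release pairs are
 * harmless; only the outermost capture sends the xcall and only the
 * outermost release lets the penguins go.
 */
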
/* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they
 * can service tlb flush xcalls...
 */
extern void prom_world(int);
extern void save_alternate_globals(unsigned long *);
extern void restore_alternate_globals(unsigned long *);
void smp_penguin_jailcell(void)
{
        unsigned long global_save[24];

        __asm__ __volatile__("flushw");
        save_alternate_globals(global_save);
        prom_world(1);
        atomic_inc(&smp_capture_registry);
        membar("#StoreLoad | #StoreStore");
        while (penguins_are_doing_time)
                membar("#LoadLoad");
        restore_alternate_globals(global_save);
        atomic_dec(&smp_capture_registry);
        prom_world(0);
}

extern unsigned long xcall_promstop;

void smp_promstop_others(void)
{
        if (smp_processors_ready)
                smp_cross_call(&xcall_promstop, 0, 0, 0);
}

extern void sparc64_do_profile(unsigned long pc, unsigned long o7);

static unsigned long current_tick_offset;

#define prof_multiplier(__cpu)  cpu_data[(__cpu)].multiplier
#define prof_counter(__cpu)     cpu_data[(__cpu)].counter

void smp_percpu_timer_interrupt(struct pt_regs *regs)
{
        unsigned long compare, tick, pstate;
        int cpu = smp_processor_id();
        int user = user_mode(regs);

        /*
         * Check for level 14 softint.
         */
        if (!(get_softint() & (1UL << 0))) {
                extern void handler_irq(int, struct pt_regs *);

                handler_irq(14, regs);
                return;
        }

        clear_softint((1UL << 0));
        do {
                if (!user)
                        sparc64_do_profile(regs->tpc, regs->u_regs[UREG_RETPC]);
                if (!--prof_counter(cpu)) {
                        if (cpu == boot_cpu_id) {
                                irq_enter(cpu, 0);

                                kstat.irqs[cpu][0]++;
                                timer_tick_interrupt(regs);

                                irq_exit(cpu, 0);
                        }

                        update_process_times(user);

                        prof_counter(cpu) = prof_multiplier(cpu);
                }

                /* Guarantee that the following sequences execute
                 * uninterrupted.
                 */
                __asm__ __volatile__("rdpr %%pstate, %0\n\t"
                                     "wrpr %0, %1, %%pstate"
                                     : "=r" (pstate)
                                     : "i" (PSTATE_IE));

                /* Workaround for Spitfire Errata (#54 I think??), I discovered
                 * this via Sun BugID 4008234, mentioned in Solaris-2.5.1 patch
                 * number 103640.
                 *
                 * On Blackbird writes to %tick_cmpr can fail, the
                 * workaround seems to be to execute the wr instruction
                 * at the start of an I-cache line, and perform a dummy
                 * read back from %tick_cmpr right after writing to it. -DaveM
                 *
                 * Just to be anal we add a workaround for Spitfire
                 * Errata 50 by preventing pipeline bypasses on the
                 * final read of the %tick register into a compare
                 * instruction.  The Errata 50 description states
                 * that %tick is not prone to this bug, but I am not
                 * taking any chances.
                 */
                __asm__ __volatile__("rd %%tick_cmpr, %0\n\t"
                                     "ba,pt %%xcc, 1f\n\t"
                                     " add %0, %2, %0\n\t"
                                     ".align 64\n"
                                     "1: wr %0, 0x0, %%tick_cmpr\n\t"
                                     "rd %%tick_cmpr, %%g0\n\t"
                                     "rd %%tick, %1\n\t"
                                     "mov %1, %1"
                                     : "=&r" (compare), "=r" (tick)
                                     : "r" (current_tick_offset));

                /* Restore PSTATE_IE. */
                __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
                                     : /* no outputs */
                                     : "r" (pstate));
        } while (tick >= compare);
}

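/* Re-arm arithmetic of the loop above, spelled out: each pass moves
 * %tick_cmpr forward by current_tick_offset from its previous value rather
 * than from "now", so the interrupt rate stays locked to the tick source
 * even when we are delayed; the do/while only repeats if %tick has already
 * run past the freshly written compare value, i.e. if we are more than a
 * full period late.
 */
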
static void __init smp_setup_percpu_timer(void)
{
        int cpu = smp_processor_id();
        unsigned long pstate;

        prof_counter(cpu) = prof_multiplier(cpu) = 1;

        /* Guarantee that the following sequences execute
         * uninterrupted.
         */
        __asm__ __volatile__("rdpr %%pstate, %0\n\t"
                             "wrpr %0, %1, %%pstate"
                             : "=r" (pstate)
                             : "i" (PSTATE_IE));

        /* Workaround for Spitfire Errata (#54 I think??), I discovered
         * this via Sun BugID 4008234, mentioned in Solaris-2.5.1 patch
         * number 103640.
         *
         * On Blackbird writes to %tick_cmpr can fail, the
         * workaround seems to be to execute the wr instruction
         * at the start of an I-cache line, and perform a dummy
         * read back from %tick_cmpr right after writing to it. -DaveM
         */
        __asm__ __volatile__("
        rd      %%tick, %%g1
        ba,pt   %%xcc, 1f
         add    %%g1, %0, %%g1
        .align  64
1:      wr      %%g1, 0x0, %%tick_cmpr
        rd      %%tick_cmpr, %%g0"
        : /* no outputs */
        : "r" (current_tick_offset)
        : "g1");

        /* Restore PSTATE_IE. */
        __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
                             : /* no outputs */
                             : "r" (pstate));
}

void __init smp_tick_init(void)
{
        int i;

        boot_cpu_id = hard_smp_processor_id();
        current_tick_offset = timer_tick_offset;
        cpu_present_map = 0;
        for (i = 0; i < linux_num_cpus; i++)
                cpu_present_map |= (1UL << linux_cpus[i].mid);
        for (i = 0; i < NR_CPUS; i++) {
                __cpu_number_map[i] = -1;
                __cpu_logical_map[i] = -1;
        }
        __cpu_number_map[boot_cpu_id] = 0;
        prom_cpu_nodes[boot_cpu_id] = linux_cpus[0].prom_node;
        __cpu_logical_map[0] = boot_cpu_id;
        current->processor = boot_cpu_id;
        prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
}

static inline unsigned long find_flush_base(unsigned long size)
{
        struct page *p = mem_map;
        unsigned long found, base;

        size = PAGE_ALIGN(size);
        found = size;
        base = (unsigned long) page_address(p);
        while (found != 0) {
                /* Failure. */
                if (p >= (mem_map + max_mapnr))
                        return 0UL;
                if (PageReserved(p)) {
                        found = size;
                        base = (unsigned long) page_address(p);
                } else {
                        found -= PAGE_SIZE;
                }
                p++;
        }
        return base;
}

cycles_t cacheflush_time;

static void __init smp_tune_scheduling (void)
{
        unsigned long orig_flush_base, flush_base, flags, *p;
        unsigned int ecache_size, order;
        cycles_t tick1, tick2, raw;

        /* Approximate heuristic for SMP scheduling.  It is an
         * estimation of the time it takes to flush the L2 cache
         * on the local processor.
         *
         * The ia32 chooses to use the L1 cache flush time instead,
         * and I consider this complete nonsense.  The Ultra can service
         * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
         * L2 misses are what create extra bus traffic (ie. the "cost"
         * of moving a process from one cpu to another).
         */
        printk("SMP: Calibrating ecache flush... ");
        ecache_size = prom_getintdefault(linux_cpus[0].prom_node,
                                         "ecache-size", (512 * 1024));
        if (ecache_size > (4 * 1024 * 1024))
                ecache_size = (4 * 1024 * 1024);
        orig_flush_base = flush_base =
                __get_free_pages(GFP_KERNEL, order = get_order(ecache_size));

        if (flush_base != 0UL) {
                __save_and_cli(flags);

                /* Scan twice the size once just to get the TLB entries
                 * loaded and make sure the second scan measures pure misses.
                 */
                for (p = (unsigned long *)flush_base;
                     ((unsigned long)p) < (flush_base + (ecache_size<<1));
                     p += (64 / sizeof(unsigned long)))
                        *((volatile unsigned long *)p);

                /* Now the real measurement. */
                __asm__ __volatile__("
                b,pt    %%xcc, 1f
                 rd     %%tick, %0

                .align  64
1:              ldx     [%2 + 0x000], %%g1
                ldx     [%2 + 0x040], %%g2
                ldx     [%2 + 0x080], %%g3
                ldx     [%2 + 0x0c0], %%g5
                add     %2, 0x100, %2
                cmp     %2, %4
                bne,pt  %%xcc, 1b
                 nop

                rd      %%tick, %1"
                : "=&r" (tick1), "=&r" (tick2), "=&r" (flush_base)
                : "2" (flush_base), "r" (flush_base + ecache_size)
                : "g1", "g2", "g3", "g5");

                __restore_flags(flags);

                raw = (tick2 - tick1);

                /* Dampen it a little, considering two processes
                 * sharing the cache and fitting.
                 */
                cacheflush_time = (raw - (raw >> 2));

                free_pages(orig_flush_base, order);
        } else {
                cacheflush_time = ((ecache_size << 2) +
                                   (ecache_size << 1));
        }

        printk("Using heuristic of %d cycles.\n",
               (int) cacheflush_time);
}

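/* For reference, the numbers behind the heuristic above: the measured
 * flush cost is dampened to raw - (raw >> 2), i.e. 75% of the raw tick
 * delta, and when the probe buffer cannot be allocated the fallback guess
 * of (ecache_size << 2) + (ecache_size << 1) works out to 6 * ecache_size
 * cycles.
 */
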
/* /proc/profile writes can call this, don't __init it please. */
int setup_profiling_timer(unsigned int multiplier)
{
        unsigned long flags;
        int i;

        if ((!multiplier) || (timer_tick_offset / multiplier) < 1000)
                return -EINVAL;

        save_and_cli(flags);
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_present_map & (1UL << i))
                        prof_multiplier(i) = multiplier;
        }
        current_tick_offset = (timer_tick_offset / multiplier);
        restore_flags(flags);

        return 0;
}
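
/* Example of the effect (illustrative numbers): setup_profiling_timer(4)
 * shrinks current_tick_offset to timer_tick_offset / 4, so the per-cpu
 * timer interrupt, and with it the profile sampling done in
 * smp_percpu_timer_interrupt(), fires four times per original tick period,
 * while prof_multiplier keeps update_process_times() at the old rate.
 * Requests that would push the interval below 1000 %tick counts are
 * rejected with -EINVAL.
 */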