gru: fix cache coherency issues with instruction retry
drivers/misc/sgi-gru/grukservices.c
/*
 * SN Platform GRU Driver
 *
 * KERNEL SERVICES THAT USE THE GRU
 *
 * Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
#include <linux/proc_fs.h>
#include <linux/interrupt.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
#include "gru.h"
#include "grulib.h"
#include "grutables.h"
#include "grukservices.h"
#include "gru_instructions.h"
#include <asm/uv/uv_hub.h>
/*
 * Kernel GRU Usage
 *
 * The following is an interim algorithm for management of kernel GRU
 * resources. This will likely be replaced when we better understand the
 * kernel/user requirements.
 *
 * Blade percpu resources reserved for kernel use. These resources are
 * reserved whenever the kernel context for the blade is loaded. Note
 * that the kernel context is not guaranteed to be always available. It is
 * loaded on demand & can be stolen by a user if the user demand exceeds the
 * kernel demand. The kernel can always reload the kernel context but
 * a SLEEP may be required!!!
 *
 * Async Overview:
 *
 *	Each blade has one "kernel context" that owns GRU kernel resources
 *	located on the blade. Kernel drivers use GRU resources in this context
 *	for sending messages, zeroing memory, etc.
 *
 *	The kernel context is dynamically loaded on demand. If it is not in
 *	use by the kernel, the kernel context can be unloaded & given to a user.
 *	The kernel context will be reloaded when needed. This may require that
 *	a context be stolen from a user.
 *		NOTE: frequent unloading/reloading of the kernel context is
 *		expensive. We are depending on batch schedulers, cpusets, sane
 *		drivers or some other mechanism to prevent the need for frequent
 *		stealing/reloading.
 *
 *	The kernel context consists of two parts:
 *		- 1 CB & a few DSRs that are reserved for each cpu on the blade.
 *		  Each cpu has its own private resources & does not share them
 *		  with other cpus. These resources are used serially, ie,
 *		  locked, used & unlocked on each call to a function in
 *		  grukservices.
 *			(Now that we have dynamic loading of kernel contexts, I
 *			may rethink this & allow sharing between cpus....)
 *
 *		- Additional resources can be reserved long term & used directly
 *		  by UV drivers located in the kernel. Drivers using these GRU
 *		  resources can use asynchronous GRU instructions that send
 *		  interrupts on completion.
 *			- these resources must be explicitly locked/unlocked
 *			- locked resources prevent (obviously) the kernel
 *			  context from being unloaded.
 *			- drivers using these resources directly issue their own
 *			  GRU instruction and must wait/check completion.
 *
 *		  When these resources are reserved, the caller can optionally
 *		  associate a wait_queue with the resources and use asynchronous
 *		  GRU instructions. When an async GRU instruction completes, the
 *		  driver will do a wakeup on the event.
 */
#define ASYNC_HAN_TO_BID(h)	((h) - 1)
#define ASYNC_BID_TO_HAN(b)	((b) + 1)
#define ASYNC_HAN_TO_BS(h)	gru_base[ASYNC_HAN_TO_BID(h)]
#define KCB_TO_GID(cb)		((cb - gru_start_vaddr) /		\
					(GRU_SIZE * GRU_CHIPLETS_PER_BLADE))
#define KCB_TO_BS(cb)		gru_base[KCB_TO_GID(cb)]

#define GRU_NUM_KERNEL_CBR	1
#define GRU_NUM_KERNEL_DSR_BYTES 256
#define GRU_NUM_KERNEL_DSR_CL	(GRU_NUM_KERNEL_DSR_BYTES /		\
					GRU_CACHE_LINE_BYTES)

/* GRU instruction attributes for all instructions */
#define IMA			IMA_CB_DELAY

/* GRU cacheline size is always 64 bytes - even on arches with 128 byte lines */
#define __gru_cacheline_aligned__                               \
	__attribute__((__aligned__(GRU_CACHE_LINE_BYTES)))

#define MAGIC	0x1234567887654321UL

/* Default retry count for GRU errors on kernel instructions */
#define EXCEPTION_RETRY_LIMIT	3

/* Status of message queue sections */
#define MQS_EMPTY		0
#define MQS_FULL		1
#define MQS_NOOP		2
/*----------------- RESOURCE MANAGEMENT -------------------------------------*/
/* optimized for x86_64 */
struct message_queue {
	union gru_mesqhead	head __gru_cacheline_aligned__;	/* CL 0 */
	int			qlines;				/* DW 1 */
	long			hstatus[2];
	void			*next __gru_cacheline_aligned__;/* CL 1 */
	void			*limit;
	void			*start;
	void			*start2;
	char			data ____cacheline_aligned;	/* CL 2 */
};

/* First word in every message - used by mesq interface */
struct message_header {
	char	present;
	char	present2;
	char	lines;
	char	fill;
};

#define HSTATUS(mq, h)	((mq) + offsetof(struct message_queue, hstatus[h]))
/*
 * Reload the blade's kernel context into a GRU chiplet. Called holding
 * the bs_kgts_sema for READ. Will steal user contexts if necessary.
 */
static void gru_load_kernel_context(struct gru_blade_state *bs, int blade_id)
{
	struct gru_state *gru;
	struct gru_thread_state *kgts;
	void *vaddr;
	int ctxnum, ncpus;

	up_read(&bs->bs_kgts_sema);
	down_write(&bs->bs_kgts_sema);

	if (!bs->bs_kgts)
		bs->bs_kgts = gru_alloc_gts(NULL, 0, 0, 0, 0);
	kgts = bs->bs_kgts;

	if (!kgts->ts_gru) {
		STAT(load_kernel_context);
		ncpus = uv_blade_nr_possible_cpus(blade_id);
		kgts->ts_cbr_au_count = GRU_CB_COUNT_TO_AU(
			GRU_NUM_KERNEL_CBR * ncpus + bs->bs_async_cbrs);
		kgts->ts_dsr_au_count = GRU_DS_BYTES_TO_AU(
			GRU_NUM_KERNEL_DSR_BYTES * ncpus +
				bs->bs_async_dsr_bytes);
		while (!gru_assign_gru_context(kgts, blade_id)) {
			msleep(1);
			gru_steal_context(kgts, blade_id);
		}
		gru_load_context(kgts);
		gru = bs->bs_kgts->ts_gru;
		vaddr = gru->gs_gru_base_vaddr;
		ctxnum = kgts->ts_ctxnum;
		bs->kernel_cb = get_gseg_base_address_cb(vaddr, ctxnum, 0);
		bs->kernel_dsr = get_gseg_base_address_ds(vaddr, ctxnum, 0);
	}
	downgrade_write(&bs->bs_kgts_sema);
}
/*
 * Lock & load the kernel context for the specified blade.
 */
static struct gru_blade_state *gru_lock_kernel_context(int blade_id)
{
	struct gru_blade_state *bs;

	STAT(lock_kernel_context);
	bs = gru_base[blade_id];

	down_read(&bs->bs_kgts_sema);
	if (!bs->bs_kgts || !bs->bs_kgts->ts_gru)
		gru_load_kernel_context(bs, blade_id);
	return bs;
}

/*
 * Unlock the kernel context for the specified blade. Context is not
 * unloaded but may be stolen before next use.
 */
static void gru_unlock_kernel_context(int blade_id)
{
	struct gru_blade_state *bs;

	bs = gru_base[blade_id];
	up_read(&bs->bs_kgts_sema);
	STAT(unlock_kernel_context);
}
/*
 * Reserve & get pointers to the DSR/CBRs reserved for the current cpu.
 *	- returns with preemption disabled
 */
static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
{
	struct gru_blade_state *bs;
	int lcpu;

	BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES);
	preempt_disable();
	bs = gru_lock_kernel_context(uv_numa_blade_id());
	lcpu = uv_blade_processor_id();
	*cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE;
	*dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES;
	return 0;
}

/*
 * Free the current cpu's reserved DSR/CBR resources.
 */
static void gru_free_cpu_resources(void *cb, void *dsr)
{
	gru_unlock_kernel_context(uv_numa_blade_id());
	preempt_enable();
}
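
/*
 * Illustrative sketch only - not part of the driver. It shows the serial
 * lock/use/unlock pattern that every grukservices function follows with
 * the current cpu's reserved CBR/DSR pair; the function name is invented.
 */
static int __maybe_unused example_cpu_resource_use(void)
{
	void *cb, *dsr;

	if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	/* ... issue a GRU instruction on cb, using dsr as operand space ... */
	gru_free_cpu_resources(cb, dsr);	/* also re-enables preemption */
	return 0;
}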
/*
 * Reserve GRU resources to be used asynchronously.
 *   Note: currently supports only 1 reservation per blade.
 *
 *	input:
 *		blade_id  - blade on which resources should be reserved
 *		cbrs	  - number of CBRs
 *		dsr_bytes - number of DSR bytes needed
 *	output:
 *		handle to identify resource
 *		(0 = async resources already reserved)
 */
unsigned long gru_reserve_async_resources(int blade_id, int cbrs, int dsr_bytes,
			struct completion *cmp)
{
	struct gru_blade_state *bs;
	struct gru_thread_state *kgts;
	int ret = 0;

	bs = gru_base[blade_id];

	down_write(&bs->bs_kgts_sema);

	/* Verify no resources already reserved */
	if (bs->bs_async_dsr_bytes + bs->bs_async_cbrs)
		goto done;
	bs->bs_async_dsr_bytes = dsr_bytes;
	bs->bs_async_cbrs = cbrs;
	bs->bs_async_wq = cmp;
	kgts = bs->bs_kgts;

	/* Resources changed. Unload context if already loaded */
	if (kgts && kgts->ts_gru)
		gru_unload_context(kgts, 0);
	ret = ASYNC_BID_TO_HAN(blade_id);

done:
	up_write(&bs->bs_kgts_sema);
	return ret;
}
/*
 * Release async resources previously reserved.
 *
 *	input:
 *		han - handle to identify resources
 */
void gru_release_async_resources(unsigned long han)
{
	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);

	down_write(&bs->bs_kgts_sema);
	bs->bs_async_dsr_bytes = 0;
	bs->bs_async_cbrs = 0;
	bs->bs_async_wq = NULL;
	up_write(&bs->bs_kgts_sema);
}
/*
 * Wait for async GRU instructions to complete.
 *
 *	input:
 *		han - handle to identify resources
 */
void gru_wait_async_cbr(unsigned long han)
{
	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);

	wait_for_completion(bs->bs_async_wq);
	mb();
}
/*
 * Lock previously reserved async GRU resources
 *
 *	input:
 *		han - handle to identify resources
 *	output:
 *		cb  - pointer to first CBR
 *		dsr - pointer to first DSR
 */
void gru_lock_async_resource(unsigned long han, void **cb, void **dsr)
{
	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
	int blade_id = ASYNC_HAN_TO_BID(han);
	int ncpus;

	gru_lock_kernel_context(blade_id);
	ncpus = uv_blade_nr_possible_cpus(blade_id);
	if (cb)
		*cb = bs->kernel_cb + ncpus * GRU_HANDLE_STRIDE;
	if (dsr)
		*dsr = bs->kernel_dsr + ncpus * GRU_NUM_KERNEL_DSR_BYTES;
}
/*
 * Unlock previously reserved async GRU resources
 *
 *	input:
 *		han - handle to identify resources
 */
void gru_unlock_async_resource(unsigned long han)
{
	int blade_id = ASYNC_HAN_TO_BID(han);

	gru_unlock_kernel_context(blade_id);
}
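
/*
 * Illustrative sketch only - not part of the driver. A hypothetical
 * in-kernel client of the async API above; the client-side names are
 * invented, the gru_* calls are the ones defined in this file. The
 * completion is assumed to be completed by the GRU interrupt path when
 * the IMA_INTERRUPT instruction finishes (see quicktest2 below for the
 * in-tree equivalent).
 */
static int __maybe_unused example_async_client(int blade_id)
{
	static DECLARE_COMPLETION(cmp);
	static unsigned long buf[4];
	unsigned long han;
	void *cb, *dsr;

	/* Reserve 1 CBR & no DSR space; 0 means already reserved */
	han = gru_reserve_async_resources(blade_id, 1, 0, &cmp);
	if (!han)
		return -EBUSY;

	/* Pin the kernel context & locate the reserved resources */
	gru_lock_async_resource(han, &cb, &dsr);

	/* Zero 4 DWs asynchronously; completion delivered via interrupt */
	gru_vset(cb, uv_gpa(buf), 0, XTYPE_DW, 4, 1, IMA_INTERRUPT);
	gru_wait_async_cbr(han);	/* sleeps until completion */

	gru_unlock_async_resource(han);
	gru_release_async_resources(han);
	return 0;
}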
/*----------------------------------------------------------------------*/
int gru_get_cb_exception_detail(void *cb,
		struct control_block_extended_exc_detail *excdet)
{
	struct gru_control_block_extended *cbe;
	struct gru_blade_state *bs;
	int cbrnum;

	bs = KCB_TO_BS(cb);
	cbrnum = thread_cbr_number(bs->bs_kgts, get_cb_number(cb));
	cbe = get_cbe(GRUBASE(cb), cbrnum);
	gru_flush_cache(cbe);	/* CBE not coherent */
	excdet->opc = cbe->opccpy;
	excdet->exopc = cbe->exopccpy;
	excdet->ecause = cbe->ecause;
	excdet->exceptdet0 = cbe->idef1upd;
	excdet->exceptdet1 = cbe->idef3upd;
	excdet->cbrexecstatus = cbe->cbrexecstatus; /* read by instruction retry */
	gru_flush_cache(cbe);
	return 0;
}
char *gru_get_cb_exception_detail_str(int ret, void *cb,
				      char *buf, int size)
{
	struct gru_control_block_status *gen = (void *)cb;
	struct control_block_extended_exc_detail excdet;

	if (ret > 0 && gen->istatus == CBS_EXCEPTION) {
		gru_get_cb_exception_detail(cb, &excdet);
		snprintf(buf, size,
			"GRU exception: cb %p, opc %d, exopc %d, ecause 0x%x, "
			"excdet0 0x%lx, excdet1 0x%x",
			gen, excdet.opc, excdet.exopc, excdet.ecause,
			excdet.exceptdet0, excdet.exceptdet1);
	} else {
		snprintf(buf, size, "No exception");
	}
	return buf;
}
static int gru_wait_idle_or_exception(struct gru_control_block_status *gen)
{
	while (gen->istatus >= CBS_ACTIVE) {
		cpu_relax();
		barrier();
	}
	return gen->istatus;
}

static int gru_retry_exception(void *cb)
{
	struct gru_control_block_status *gen = (void *)cb;
	struct control_block_extended_exc_detail excdet;
	int retry = EXCEPTION_RETRY_LIMIT;

	while (1) {
		if (gru_get_cb_message_queue_substatus(cb))
			break;
		if (gru_wait_idle_or_exception(gen) == CBS_IDLE)
			return CBS_IDLE;

		gru_get_cb_exception_detail(cb, &excdet);
		if ((excdet.ecause & ~EXCEPTION_RETRY_BITS) ||
				(excdet.cbrexecstatus & CBR_EXS_ABORT_OCC))
			break;
		if (retry-- == 0)
			break;
		gen->icmd = 1;		/* restart the instruction */
		gru_flush_cache(gen);
	}
	return CBS_EXCEPTION;
}
int gru_check_status_proc(void *cb)
{
	struct gru_control_block_status *gen = (void *)cb;
	int ret;

	ret = gen->istatus;
	if (ret != CBS_EXCEPTION)
		return ret;
	return gru_retry_exception(cb);
}

int gru_wait_proc(void *cb)
{
	struct gru_control_block_status *gen = (void *)cb;
	int ret;

	ret = gru_wait_idle_or_exception(gen);
	if (ret == CBS_EXCEPTION)
		ret = gru_retry_exception(cb);

	return ret;
}

void gru_abort(int ret, void *cb, char *str)
{
	char buf[GRU_EXC_STR_SIZE];

	panic("GRU FATAL ERROR: %s - %s\n", str,
	      gru_get_cb_exception_detail_str(ret, cb, buf, sizeof(buf)));
}

void gru_wait_abort_proc(void *cb)
{
	int ret;

	ret = gru_wait_proc(cb);
	if (ret)
		gru_abort(ret, cb, "gru_wait_abort");
}
/*------------------------------ MESSAGE QUEUES -----------------------------*/

/* Internal status. These are NOT returned to the user. */
#define MQIE_AGAIN		-1	/* try again */

/*
 * Save/restore the "present" flag that is in the second line of 2-line
 * messages
 */
static inline int get_present2(void *p)
{
	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
	return mhdr->present;
}

static inline void restore_present2(void *p, int val)
{
	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
	mhdr->present = val;
}
/*
 * Create a message queue.
 *	qlines - message queue size in cache lines. Includes 2-line header.
 */
int gru_create_message_queue(struct gru_message_queue_desc *mqd,
		void *p, unsigned int bytes, int nasid, int vector, int apicid)
{
	struct message_queue *mq = p;
	unsigned int qlines;

	qlines = bytes / GRU_CACHE_LINE_BYTES - 2;
	memset(mq, 0, bytes);
	mq->start = &mq->data;
	mq->start2 = &mq->data + (qlines / 2 - 1) * GRU_CACHE_LINE_BYTES;
	mq->next = &mq->data;
	mq->limit = &mq->data + (qlines - 2) * GRU_CACHE_LINE_BYTES;
	mq->qlines = qlines;
	mq->hstatus[0] = 0;
	mq->hstatus[1] = 1;
	mq->head = gru_mesq_head(2, qlines / 2 + 1);
	mqd->mq = mq;
	mqd->mq_gpa = uv_gpa(mq);
	mqd->qlines = qlines;
	mqd->interrupt_pnode = UV_NASID_TO_PNODE(nasid);
	mqd->interrupt_vector = vector;
	mqd->interrupt_apicid = apicid;
	return 0;
}
EXPORT_SYMBOL_GPL(gru_create_message_queue);
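
/*
 * Illustrative sketch only - not part of the driver. Creating a small
 * message queue: 8 cachelines = 2-line header + 6 lines of messages.
 * The nasid/vector/apicid triple is zeroed here, i.e. no delivery
 * interrupt; a power-of-2 kmalloc size is relied on for the cacheline
 * alignment the queue needs. The function name is invented.
 */
static int __maybe_unused example_create_mq(struct gru_message_queue_desc *mqd)
{
	int bytes = 8 * GRU_CACHE_LINE_BYTES;
	void *p;

	p = kmalloc(bytes, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	return gru_create_message_queue(mqd, p, bytes, 0, 0, 0);
}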
/*
 * Send a NOOP message to a message queue
 *	Returns:
 *		 0 - if queue is full after the send. This is the normal case
 *		     but various races can change this.
 *		-1 - if mesq sent successfully but queue not full
 *		>0 - unexpected error. MQE_xxx returned
 */
static int send_noop_message(void *cb, struct gru_message_queue_desc *mqd,
				void *mesg)
{
	const struct message_header noop_header = {
					.present = MQS_NOOP, .lines = 1};
	unsigned long m;
	int substatus, ret;
	struct message_header save_mhdr, *mhdr = mesg;

	STAT(mesq_noop);
	save_mhdr = *mhdr;
	*mhdr = noop_header;
	gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), 1, IMA);
	ret = gru_wait(cb);

	if (ret) {
		substatus = gru_get_cb_message_queue_substatus(cb);
		switch (substatus) {
		case CBSS_NO_ERROR:
			STAT(mesq_noop_unexpected_error);
			ret = MQE_UNEXPECTED_CB_ERR;
			break;
		case CBSS_LB_OVERFLOWED:
			STAT(mesq_noop_lb_overflow);
			ret = MQE_CONGESTION;
			break;
		case CBSS_QLIMIT_REACHED:
			STAT(mesq_noop_qlimit_reached);
			ret = 0;
			break;
		case CBSS_AMO_NACKED:
			STAT(mesq_noop_amo_nacked);
			ret = MQE_CONGESTION;
			break;
		case CBSS_PUT_NACKED:
			STAT(mesq_noop_put_nacked);
			m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
			gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1,
						IMA);
			if (gru_wait(cb) == CBS_IDLE)
				ret = MQIE_AGAIN;
			else
				ret = MQE_UNEXPECTED_CB_ERR;
			break;
		case CBSS_PAGE_OVERFLOW:
		default:
			BUG();
		}
	}
	*mhdr = save_mhdr;
	return ret;
}
/*
 * Handle a gru_mesq full.
 */
static int send_message_queue_full(void *cb, struct gru_message_queue_desc *mqd,
				void *mesg, int lines)
{
	union gru_mesqhead mqh;
	unsigned int limit, head;
	unsigned long avalue;
	int half, qlines;

	/* Determine if switching to first/second half of q */
	avalue = gru_get_amo_value(cb);
	head = gru_get_amo_value_head(cb);
	limit = gru_get_amo_value_limit(cb);

	qlines = mqd->qlines;
	half = (limit != qlines);

	if (half)
		mqh = gru_mesq_head(qlines / 2 + 1, qlines);
	else
		mqh = gru_mesq_head(2, qlines / 2 + 1);

	/* Try to get lock for switching head pointer */
	gru_gamir(cb, EOP_IR_CLR, HSTATUS(mqd->mq_gpa, half), XTYPE_DW, IMA);
	if (gru_wait(cb) != CBS_IDLE)
		goto cberr;
	if (!gru_get_amo_value(cb)) {
		STAT(mesq_qf_locked);
		return MQE_QUEUE_FULL;
	}

	/* Got the lock. Send optional NOP if queue not full, */
	if (head != limit) {
		if (send_noop_message(cb, mqd, mesg)) {
			gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half),
					XTYPE_DW, IMA);
			if (gru_wait(cb) != CBS_IDLE)
				goto cberr;
			STAT(mesq_qf_noop_not_full);
			return MQIE_AGAIN;
		}
		avalue++;
	}

	/* Then flip queuehead to other half of queue. */
	gru_gamer(cb, EOP_ERR_CSWAP, mqd->mq_gpa, XTYPE_DW, mqh.val, avalue,
							IMA);
	if (gru_wait(cb) != CBS_IDLE)
		goto cberr;

	/* If not successful in swapping queue head, clear the hstatus lock */
	if (gru_get_amo_value(cb) != avalue) {
		STAT(mesq_qf_switch_head_failed);
		gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half), XTYPE_DW,
							IMA);
		if (gru_wait(cb) != CBS_IDLE)
			goto cberr;
	}
	return MQIE_AGAIN;
cberr:
	STAT(mesq_qf_unexpected_error);
	return MQE_UNEXPECTED_CB_ERR;
}
/*
 * Send a cross-partition interrupt to the SSI that contains the target
 * message queue. Normally, the interrupt is automatically delivered by
 * hardware but some error conditions require explicit delivery.
 */
static void send_message_queue_interrupt(struct gru_message_queue_desc *mqd)
{
	if (mqd->interrupt_vector)
		uv_hub_send_ipi(mqd->interrupt_pnode, mqd->interrupt_apicid,
				mqd->interrupt_vector);
}

/*
 * Handle a PUT failure. Note: if message was a 2-line message, one of the
 * lines might have been successfully written. Before sending the
 * message, "present" must be cleared in BOTH lines to prevent the receiver
 * from prematurely seeing the full message.
 */
static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd,
			void *mesg, int lines)
{
	unsigned long m;

	m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
	if (lines == 2) {
		gru_vset(cb, m, 0, XTYPE_CL, lines, 1, IMA);
		if (gru_wait(cb) != CBS_IDLE)
			return MQE_UNEXPECTED_CB_ERR;
	}
	gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
	if (gru_wait(cb) != CBS_IDLE)
		return MQE_UNEXPECTED_CB_ERR;
	send_message_queue_interrupt(mqd);
	return MQE_OK;
}
/*
 * Handle a gru_mesq failure. Some of these failures are software recoverable
 * or retryable.
 */
static int send_message_failure(void *cb, struct gru_message_queue_desc *mqd,
				void *mesg, int lines)
{
	int substatus, ret = 0;

	substatus = gru_get_cb_message_queue_substatus(cb);
	switch (substatus) {
	case CBSS_NO_ERROR:
		STAT(mesq_send_unexpected_error);
		ret = MQE_UNEXPECTED_CB_ERR;
		break;
	case CBSS_LB_OVERFLOWED:
		STAT(mesq_send_lb_overflow);
		ret = MQE_CONGESTION;
		break;
	case CBSS_QLIMIT_REACHED:
		STAT(mesq_send_qlimit_reached);
		ret = send_message_queue_full(cb, mqd, mesg, lines);
		break;
	case CBSS_AMO_NACKED:
		STAT(mesq_send_amo_nacked);
		ret = MQE_CONGESTION;
		break;
	case CBSS_PUT_NACKED:
		STAT(mesq_send_put_nacked);
		ret = send_message_put_nacked(cb, mqd, mesg, lines);
		break;
	default:
		BUG();
	}
	return ret;
}
/*
 * Send a message to a message queue
 *	mqd	message queue descriptor
 *	mesg	message. Must be vaddr within a GSEG
 *	bytes	message size (<= 2 CL)
 */
int gru_send_message_gpa(struct gru_message_queue_desc *mqd, void *mesg,
				unsigned int bytes)
{
	struct message_header *mhdr;
	void *cb;
	void *dsr;
	int istatus, clines, ret;

	STAT(mesq_send);
	BUG_ON(bytes < sizeof(int) || bytes > 2 * GRU_CACHE_LINE_BYTES);

	clines = DIV_ROUND_UP(bytes, GRU_CACHE_LINE_BYTES);
	if (gru_get_cpu_resources(bytes, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	memcpy(dsr, mesg, bytes);
	mhdr = dsr;
	mhdr->present = MQS_FULL;
	mhdr->lines = clines;
	if (clines == 2) {
		mhdr->present2 = get_present2(mhdr);
		restore_present2(mhdr, MQS_FULL);
	}

	do {
		ret = MQE_OK;
		gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), clines, IMA);
		istatus = gru_wait(cb);
		if (istatus != CBS_IDLE)
			ret = send_message_failure(cb, mqd, dsr, clines);
	} while (ret == MQIE_AGAIN);
	gru_free_cpu_resources(cb, dsr);

	if (ret)
		STAT(mesq_send_failed);
	return ret;
}
EXPORT_SYMBOL_GPL(gru_send_message_gpa);
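
/*
 * Illustrative sketch only - not part of the driver. Sending a 1-line
 * message to a queue created with gru_create_message_queue(); retrying
 * on MQE_CONGESTION is the caller's choice (quicktest1 below does the
 * same). The function name and payload are invented.
 */
static int __maybe_unused example_send(struct gru_message_queue_desc *mqd)
{
	char mesg[GRU_CACHE_LINE_BYTES];
	int ret;

	memset(mesg, 0, sizeof(mesg));
	mesg[8] = 42;	/* payload; the first word is the message header */
	do {
		ret = gru_send_message_gpa(mqd, mesg, sizeof(mesg));
	} while (ret == MQE_CONGESTION);
	return ret;
}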
/*
 * Advance the receive pointer for the queue to the next message.
 */
void gru_free_message(struct gru_message_queue_desc *mqd, void *mesg)
{
	struct message_queue *mq = mqd->mq;
	struct message_header *mhdr = mq->next;
	void *next, *pnext;
	int half = -1;
	int lines = mhdr->lines;

	if (lines == 2)
		restore_present2(mhdr, MQS_EMPTY);
	mhdr->present = MQS_EMPTY;

	pnext = mq->next;
	next = pnext + GRU_CACHE_LINE_BYTES * lines;
	if (next == mq->limit) {
		next = mq->start;
		half = 1;
	} else if (pnext < mq->start2 && next >= mq->start2) {
		half = 0;
	}

	if (half >= 0)
		mq->hstatus[half] = 1;
	mq->next = next;
}
EXPORT_SYMBOL_GPL(gru_free_message);
/*
 * Get next message from message queue. Return NULL if no message
 * present. User must call gru_free_message() to move to next message.
 *	rmq	message queue
 */
void *gru_get_next_message(struct gru_message_queue_desc *mqd)
{
	struct message_queue *mq = mqd->mq;
	struct message_header *mhdr = mq->next;
	int present = mhdr->present;

	/* skip NOOP messages */
	STAT(mesq_receive);
	while (present == MQS_NOOP) {
		gru_free_message(mqd, mhdr);
		mhdr = mq->next;
		present = mhdr->present;
	}

	/* Wait for both halves of 2 line messages */
	if (present == MQS_FULL && mhdr->lines == 2 &&
				get_present2(mhdr) == MQS_EMPTY)
		present = MQS_EMPTY;

	if (!present) {
		STAT(mesq_receive_none);
		return NULL;
	}

	if (mhdr->lines == 2)
		restore_present2(mhdr, mhdr->present2);

	return mhdr;
}
EXPORT_SYMBOL_GPL(gru_get_next_message);
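
/*
 * Illustrative sketch only - not part of the driver. Draining a queue:
 * gru_get_next_message() returns a pointer into the queue itself, so a
 * message must be consumed (or copied out) before gru_free_message()
 * releases its lines. The function name is invented.
 */
static void __maybe_unused example_receive(struct gru_message_queue_desc *mqd)
{
	void *mesg;

	while ((mesg = gru_get_next_message(mqd)) != NULL) {
		/* ... process the message payload here ... */
		gru_free_message(mqd, mesg);
	}
}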
/* ---------------------- GRU DATA COPY FUNCTIONS ---------------------------*/

/*
 * Copy a block of data using the GRU resources
 */
int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
				unsigned int bytes)
{
	void *cb;
	void *dsr;
	int ret;

	STAT(copy_gpa);
	if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	gru_bcopy(cb, src_gpa, dest_gpa, gru_get_tri(dsr),
		  XTYPE_B, bytes, GRU_NUM_KERNEL_DSR_CL, IMA);
	ret = gru_wait(cb);
	gru_free_cpu_resources(cb, dsr);
	return ret;
}
EXPORT_SYMBOL_GPL(gru_copy_gpa);
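
/*
 * Illustrative sketch only - not part of the driver. gru_copy_gpa()
 * takes global physical addresses; uv_gpa() converts kernel virtual
 * addresses of local memory. The function name is invented.
 */
static int __maybe_unused example_copy(void *dst, void *src,
				unsigned int bytes)
{
	return gru_copy_gpa(uv_gpa(dst), uv_gpa(src), bytes);
}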
/* ------------------- KERNEL QUICKTESTS RUN AT STARTUP ----------------*/
/* Temp - will delete after we gain confidence in the GRU */

static int quicktest0(unsigned long arg)
{
	unsigned long word0;
	unsigned long word1;
	void *cb;
	void *dsr;
	unsigned long *p;
	int ret = -EIO;

	if (gru_get_cpu_resources(GRU_CACHE_LINE_BYTES, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	p = dsr;
	word0 = MAGIC;
	word1 = 0;

	gru_vload(cb, uv_gpa(&word0), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
	if (gru_wait(cb) != CBS_IDLE) {
		printk(KERN_DEBUG "GRU quicktest0: CBR failure 1\n");
		goto done;
	}

	if (*p != MAGIC) {
		printk(KERN_DEBUG "GRU: quicktest0 bad magic 0x%lx\n", *p);
		goto done;
	}
	gru_vstore(cb, uv_gpa(&word1), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
	if (gru_wait(cb) != CBS_IDLE) {
		printk(KERN_DEBUG "GRU quicktest0: CBR failure 2\n");
		goto done;
	}

	if (word0 != word1 || word1 != MAGIC) {
		printk(KERN_DEBUG
		       "GRU quicktest0 err: found 0x%lx, expected 0x%lx\n",
		       word1, MAGIC);
		goto done;
	}
	ret = 0;

done:
	gru_free_cpu_resources(cb, dsr);
	return ret;
}
#define ALIGNUP(p, q)	((void *)(((unsigned long)(p) + (q) - 1) & ~(q - 1)))

static int quicktest1(unsigned long arg)
{
	struct gru_message_queue_desc mqd;
	void *p, *mq;
	unsigned long *dw;
	int i, ret = -EIO;
	char mes[GRU_CACHE_LINE_BYTES], *m;

	/* Need 1K cacheline aligned that does not cross page boundary */
	p = kmalloc(4096, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	mq = ALIGNUP(p, 1024);
	memset(mes, 0xee, sizeof(mes));
	dw = mq;

	gru_create_message_queue(&mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);
	for (i = 0; i < 6; i++) {
		mes[8] = i;
		do {
			ret = gru_send_message_gpa(&mqd, mes, sizeof(mes));
		} while (ret == MQE_CONGESTION);
		if (ret)
			break;
	}
	if (ret != MQE_QUEUE_FULL || i != 4)
		goto done;

	for (i = 0; i < 6; i++) {
		m = gru_get_next_message(&mqd);
		if (!m || m[8] != i)
			break;
		gru_free_message(&mqd, m);
	}
	ret = (i == 4) ? 0 : -EIO;

done:
	kfree(p);
	return ret;
}
static int quicktest2(unsigned long arg)
{
	static DECLARE_COMPLETION(cmp);
	unsigned long han;
	int blade_id = 0;
	int numcb = 4;
	int ret = 0;
	unsigned long *buf;
	void *cb0, *cb;
	int i, k, istatus, bytes;

	bytes = numcb * 4 * 8;
	buf = kmalloc(bytes, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = -EBUSY;
	han = gru_reserve_async_resources(blade_id, numcb, 0, &cmp);
	if (!han)
		goto done;

	gru_lock_async_resource(han, &cb0, NULL);
	memset(buf, 0xee, bytes);
	for (i = 0; i < numcb; i++)
		gru_vset(cb0 + i * GRU_HANDLE_STRIDE, uv_gpa(&buf[i * 4]), 0,
				XTYPE_DW, 4, 1, IMA_INTERRUPT);

	ret = 0;
	for (k = 0; k < numcb; k++) {
		gru_wait_async_cbr(han);
		for (i = 0; i < numcb; i++) {
			cb = cb0 + i * GRU_HANDLE_STRIDE;
			istatus = gru_check_status(cb);
			if (istatus == CBS_ACTIVE)
				continue;
			if (istatus == CBS_EXCEPTION)
				ret = -EFAULT;
			/* each CB zeroed the 4 DWs at buf[4 * i] */
			else if (buf[4 * i] || buf[4 * i + 1] ||
					buf[4 * i + 2] || buf[4 * i + 3])
				ret = -EIO;
		}
	}
	BUG_ON(cmp.done);

	gru_unlock_async_resource(han);
	gru_release_async_resources(han);
done:
	kfree(buf);
	return ret;
}
/*
 * Debugging only. User hook for various kernel tests
 * of driver & gru.
 */
int gru_ktest(unsigned long arg)
{
	int ret = -EINVAL;

	switch (arg & 0xff) {
	case 0:
		ret = quicktest0(arg);
		break;
	case 1:
		ret = quicktest1(arg);
		break;
	case 2:
		ret = quicktest2(arg);
		break;
	}
	return ret;
}
int gru_kservices_init(struct gru_state *gru)
{
	struct gru_blade_state *bs;

	bs = gru->gs_blade;
	if (gru != &bs->bs_grus[0])
		return 0;

	init_rwsem(&bs->bs_kgts_sema);
	return 0;
}

void gru_kservices_exit(struct gru_state *gru)
{
	struct gru_blade_state *bs;
	struct gru_thread_state *kgts;

	bs = gru->gs_blade;
	if (gru != &bs->bs_grus[0])
		return;

	kgts = bs->bs_kgts;
	if (kgts && kgts->ts_gru)
		gru_unload_context(kgts, 0);
	kfree(kgts);
}