src/rx/rx_packet.c
1 /*
2 * Copyright 2000, International Business Machines Corporation and others.
3 * All Rights Reserved.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
8 */
10 #include <afsconfig.h>
11 #include <afs/param.h>
13 #ifdef KERNEL
14 # if defined(UKERNEL)
15 # include "afs/sysincludes.h"
16 # include "afsincludes.h"
17 # include "rx_kcommon.h"
18 # else /* defined(UKERNEL) */
19 # ifdef RX_KERNEL_TRACE
20 # include "rx_kcommon.h"
21 # endif
22 # include "h/types.h"
23 # ifndef AFS_LINUX20_ENV
24 # include "h/systm.h"
25 # endif
26 # if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
27 # include "afs/sysincludes.h"
28 # endif
29 # if defined(AFS_OBSD_ENV)
30 # include "h/proc.h"
31 # endif
32 # include "h/socket.h"
33 # if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
34 # if !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
35 # include "sys/mount.h" /* it gets pulled in by something later anyway */
36 # endif
37 # include "h/mbuf.h"
38 # endif
39 # include "netinet/in.h"
40 # include "afs/afs_osi.h"
41 # include "rx_kmutex.h"
42 # endif /* defined(UKERNEL) */
43 #else /* KERNEL */
44 # include <roken.h>
45 # include <assert.h>
46 # include <afs/opr.h>
47 # if defined(AFS_NT40_ENV)
48 # ifndef EWOULDBLOCK
49 # define EWOULDBLOCK WSAEWOULDBLOCK
50 # endif
51 # include "rx_user.h"
52 # include "rx_xmit_nt.h"
53 # endif
54 # include <lwp.h>
55 #endif /* KERNEL */
57 #ifdef AFS_SUN5_ENV
58 # include <sys/sysmacros.h>
59 #endif
61 #include <opr/queue.h>
63 #include "rx.h"
64 #include "rx_clock.h"
65 #include "rx_packet.h"
66 #include "rx_atomic.h"
67 #include "rx_globals.h"
68 #include "rx_internal.h"
69 #include "rx_stats.h"
71 #include "rx_peer.h"
72 #include "rx_conn.h"
73 #include "rx_call.h"
75 /*!
76 * \brief structure used to keep track of allocated packets
77 */
78 struct rx_mallocedPacket {
79 struct opr_queue entry; /*!< chained using opr_queue */
80 struct rx_packet *addr; /*!< address of the first element */
81 afs_uint32 size; /*!< array size in bytes */
82 };
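/*
 * Each array of packets handed out by the rxi_MorePackets* routines below is
 * recorded in one of these entries via registerPackets(), so that
 * rxi_FreeAllPackets() can later walk rx_mallocedPacketQueue (protected by
 * rx_mallocedPktQ_lock) and release every array again.
 */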
84 #ifdef RX_LOCKS_DB
85 /* rxdb_fileID is used to identify the lock location, along with line#. */
86 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
87 #endif /* RX_LOCKS_DB */
88 static struct rx_packet *rx_mallocedP = 0;
89 #ifdef RXDEBUG_PACKET
90 static afs_uint32 rx_packet_id = 0;
91 #endif
93 extern char cml_version_number[];
95 static int AllocPacketBufs(int class, int num_pkts, struct opr_queue *q);
97 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
98 afs_uint32 ahost, short aport,
99 afs_int32 istack);
100 static struct rx_packet *rxi_AllocPacketNoLock(int class);
102 #ifndef KERNEL
103 static void rxi_MorePacketsNoLock(int apackets);
104 #endif
106 #ifdef RX_ENABLE_TSFPQ
107 static int rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first,
108 int flush_global);
109 static void rxi_AdjustLocalPacketsTSFPQ(int num_keep_local,
110 int allow_overcommit);
111 #else
112 static void rxi_FreePacketNoLock(struct rx_packet *p);
113 static int rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first);
114 static int rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first,
115 struct opr_queue * q);
116 #endif
118 extern struct opr_queue rx_idleServerQueue;
120 /* some rules about packets:
121 * 1. When a packet is allocated, the final iov_buf contains room for
122 * a security trailer, but iov_len masks that fact. If the security
123 * package wants to add the trailer, it may do so, and then extend
124 * iov_len appropriately. For this reason, packet's niovecs and
125 * iov_len fields should be accurate before calling PreparePacket.
128 /* Preconditions:
129 * all packet buffers (iov_base) are integral multiples of
130 * the word size.
131 * offset is an integral multiple of the word size.
133 afs_int32
134 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
136 unsigned int i;
137 size_t l;
138 for (l = 0, i = 1; i < packet->niovecs; i++) {
139 if (l + packet->wirevec[i].iov_len > offset) {
140 return
141 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
142 (offset - l)));
144 l += packet->wirevec[i].iov_len;
147 return 0;
150 /* Preconditions:
151 * all packet buffers (iov_base) are integral multiples of the word size.
152 * offset is an integral multiple of the word size.
154 afs_int32
155 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
157 unsigned int i;
158 size_t l;
159 for (l = 0, i = 1; i < packet->niovecs; i++) {
160 if (l + packet->wirevec[i].iov_len > offset) {
161 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
162 (offset - l))) = data;
163 return 0;
165 l += packet->wirevec[i].iov_len;
168 return 0;
171 /* Preconditions:
172 * all packet buffers (iov_base) are integral multiples of the
173 * word size.
174 * offset is an integral multiple of the word size.
175 * Packet Invariants:
176 * all buffers are contiguously arrayed in the iovec from 0..niovecs-1
178 afs_int32
179 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
180 char *out)
182 unsigned int i, j, l, r;
183 for (l = 0, i = 1; i < packet->niovecs; i++) {
184 if (l + packet->wirevec[i].iov_len > offset) {
185 break;
187 l += packet->wirevec[i].iov_len;
190 /* i is the iovec which contains the first little bit of data in which we
191 * are interested. l is the total length of everything prior to this iovec.
192 * j is the number of bytes we can safely copy out of this iovec.
193 * offset only applies to the first iovec.
195 r = resid;
196 while ((r > 0) && (i < packet->niovecs)) {
197 j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
198 memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
199 r -= j;
200 out += j;
201 l += packet->wirevec[i].iov_len;
202 offset = l;
203 i++;
206 return (r ? (resid - r) : resid);
210 /* Preconditions:
211 * all packet buffers (iov_base) are integral multiples of the
212 * word size.
213 * offset is an integral multiple of the word size.
215 afs_int32
216 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
218 unsigned int i, j, l, o, r;
219 char *b;
221 for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
222 if (l + packet->wirevec[i].iov_len > o) {
223 break;
225 l += packet->wirevec[i].iov_len;
228 /* i is the iovec which contains the first little bit of data in which we
229 * are interested. l is the total length of everything prior to this iovec.
230 * j is the number of bytes we can safely copy out of this iovec.
231 * offset only applies to the first iovec.
233 r = resid;
234 while ((r > 0) && (i <= RX_MAXWVECS)) {
235 if (i >= packet->niovecs)
236 if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0) /* ++niovecs as a side-effect */
237 break;
239 b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
240 j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
241 memcpy(b, in, j);
242 r -= j;
243 in += j;
244 l += packet->wirevec[i].iov_len;
245 offset = l;
246 i++;
249 return (r ? (resid - r) : resid);
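/*
 * The rx_packetread()/rx_packetwrite() calls used later in this file (for
 * example in rxi_ReceiveDebugPacket) are expected to reduce to a plain memcpy
 * when the requested range fits in one iovec and to fall back to the Slow
 * routines above otherwise.  A minimal caller sketch, assuming a packet p
 * whose data area may span several iovecs:
 *
 *     struct rx_debugIn tin;
 *     if (rx_SlowReadPacket(p, 0, sizeof(tin), (char *)&tin) != sizeof(tin))
 *         return;                        (short packet; copy came up short)
 *     tin.type = ntohl(tin.type);
 *
 * Both Slow copy routines return the number of bytes actually transferred.
 */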
253 rxi_AllocPackets(int class, int num_pkts, struct opr_queue * q)
255 struct opr_queue *c;
257 num_pkts = AllocPacketBufs(class, num_pkts, q);
259 for (opr_queue_Scan(q, c)) {
260 RX_PACKET_IOV_FULLINIT(opr_queue_Entry(c, struct rx_packet, entry));
263 return num_pkts;
266 #ifdef RX_ENABLE_TSFPQ
267 static int
268 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
270 struct rx_ts_info_t * rx_ts_info;
271 int transfer;
272 SPLVAR;
274 RX_TS_INFO_GET(rx_ts_info);
276 transfer = num_pkts - rx_ts_info->_FPQ.len;
277 if (transfer > 0) {
278 NETPRI;
279 MUTEX_ENTER(&rx_freePktQ_lock);
280 transfer = MAX(transfer, rx_TSFPQGlobSize);
281 if (transfer > rx_nFreePackets) {
282 /* alloc enough for us, plus a few globs for other threads */
283 rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
286 RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
288 MUTEX_EXIT(&rx_freePktQ_lock);
289 USERPRI;
292 RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
294 return num_pkts;
296 #else /* RX_ENABLE_TSFPQ */
297 static int
298 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
300 struct rx_packet *c;
301 int i;
302 #ifdef KERNEL
303 int overq = 0;
304 #endif
305 SPLVAR;
307 NETPRI;
309 MUTEX_ENTER(&rx_freePktQ_lock);
311 #ifdef KERNEL
312 for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
313 num_pkts--, overq++);
315 if (overq) {
316 rxi_NeedMorePackets = TRUE;
317 if (rx_stats_active) {
318 switch (class) {
319 case RX_PACKET_CLASS_RECEIVE:
320 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
321 break;
322 case RX_PACKET_CLASS_SEND:
323 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
324 break;
325 case RX_PACKET_CLASS_SPECIAL:
326 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
327 break;
328 case RX_PACKET_CLASS_RECV_CBUF:
329 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
330 break;
331 case RX_PACKET_CLASS_SEND_CBUF:
332 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
333 break;
338 if (rx_nFreePackets < num_pkts)
339 num_pkts = rx_nFreePackets;
341 if (!num_pkts) {
342 rxi_NeedMorePackets = TRUE;
343 goto done;
345 #else /* KERNEL */
346 if (rx_nFreePackets < num_pkts) {
347 rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
349 #endif /* KERNEL */
351 for (i=0, c=opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
352 i < num_pkts;
353 i++, c=opr_queue_Next(&c->entry, struct rx_packet, entry)) {
354 RX_FPQ_MARK_USED(c);
357 opr_queue_SplitBeforeAppend(&rx_freePacketQueue, q, &c->entry);
359 rx_nFreePackets -= num_pkts;
361 #ifdef KERNEL
362 done:
363 #endif
364 MUTEX_EXIT(&rx_freePktQ_lock);
366 USERPRI;
367 return num_pkts;
369 #endif /* RX_ENABLE_TSFPQ */
372 * Free a packet currently used as a continuation buffer
374 #ifdef RX_ENABLE_TSFPQ
375 /* num_pkts=0 means queue length is unknown */
377 rxi_FreePackets(int num_pkts, struct opr_queue * q)
379 struct rx_ts_info_t * rx_ts_info;
380 struct opr_queue *cursor, *store;
381 SPLVAR;
383 osi_Assert(num_pkts >= 0);
384 RX_TS_INFO_GET(rx_ts_info);
386 if (!num_pkts) {
387 for (opr_queue_ScanSafe(q, cursor, store)) {
388 num_pkts++;
389 rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
390 entry), 2, 0);
392 } else {
393 for (opr_queue_ScanSafe(q, cursor, store)) {
394 rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
395 entry), 2, 0);
399 if (num_pkts) {
400 RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
403 if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
404 NETPRI;
405 MUTEX_ENTER(&rx_freePktQ_lock);
407 RX_TS_FPQ_LTOG(rx_ts_info);
409 /* Wakeup anyone waiting for packets */
410 rxi_PacketsUnWait();
412 MUTEX_EXIT(&rx_freePktQ_lock);
413 USERPRI;
416 return num_pkts;
418 #else /* RX_ENABLE_TSFPQ */
419 /* num_pkts=0 means queue length is unknown */
421 rxi_FreePackets(int num_pkts, struct opr_queue *q)
423 struct opr_queue cbs;
424 struct opr_queue *cursor, *store;
425 int qlen = 0;
426 SPLVAR;
428 osi_Assert(num_pkts >= 0);
429 opr_queue_Init(&cbs);
431 if (!num_pkts) {
432 for (opr_queue_ScanSafe(q, cursor, store)) {
433 struct rx_packet *p
434 = opr_queue_Entry(cursor, struct rx_packet, entry);
435 if (p->niovecs > 2) {
436 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
438 RX_FPQ_MARK_FREE(p);
439 num_pkts++;
441 if (!num_pkts)
442 return 0;
443 } else {
444 for (opr_queue_ScanSafe(q, cursor, store)) {
445 struct rx_packet *p
446 = opr_queue_Entry(cursor, struct rx_packet, entry);
448 if (p->niovecs > 2) {
449 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
451 RX_FPQ_MARK_FREE(p);
455 if (qlen) {
456 opr_queue_SpliceAppend(q, &cbs);
457 qlen += num_pkts;
458 } else
459 qlen = num_pkts;
461 NETPRI;
462 MUTEX_ENTER(&rx_freePktQ_lock);
464 opr_queue_SpliceAppend(&rx_freePacketQueue, q);
465 rx_nFreePackets += qlen;
467 /* Wakeup anyone waiting for packets */
468 rxi_PacketsUnWait();
470 MUTEX_EXIT(&rx_freePktQ_lock);
471 USERPRI;
473 return num_pkts;
475 #endif /* RX_ENABLE_TSFPQ */
477 /* this one is kind of awful.
478 * In rxkad, the packet has been all shortened, and everything, ready for
479 * sending. All of a sudden, we discover we need some of that space back.
480 * This isn't terribly general, because it knows that the packets are only
481 * rounded up to the EBS (userdata + security header).
484 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
486 int i;
487 i = p->niovecs - 1;
488 if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
489 if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
490 p->wirevec[i].iov_len += nb;
491 return 0;
493 } else {
494 if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
495 p->wirevec[i].iov_len += nb;
496 return 0;
500 return 0;
503 /* get sufficient space to store nb bytes of data (or more), and hook
504 * it into the supplied packet. Return nbytes<=0 if successful, otherwise
505 * returns the number of bytes >0 which it failed to come up with.
506 * Don't need to worry about locking on packet, since only
507 * one thread can manipulate one at a time. Locking on continuation
508 * packets is handled by AllocPacketBufs */
509 /* MTUXXX don't need to go through the for loop if we can trust niovecs */
511 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
513 int i, nv;
514 struct opr_queue q, *cursor, *store;
516 /* compute the number of cbuf's we need */
517 nv = nb / RX_CBUFFERSIZE;
518 if ((nv * RX_CBUFFERSIZE) < nb)
519 nv++;
520 if ((nv + p->niovecs) > RX_MAXWVECS)
521 nv = RX_MAXWVECS - p->niovecs;
522 if (nv < 1)
523 return nb;
525 /* allocate buffers */
526 opr_queue_Init(&q);
527 nv = AllocPacketBufs(class, nv, &q);
529 /* setup packet iovs */
530 i = p->niovecs;
531 for (opr_queue_ScanSafe(&q, cursor, store)) {
532 struct rx_packet *cb
533 = opr_queue_Entry(cursor, struct rx_packet, entry);
535 opr_queue_Remove(&cb->entry);
536 p->wirevec[i].iov_base = (caddr_t) cb->localdata;
537 p->wirevec[i].iov_len = RX_CBUFFERSIZE;
538 i++;
541 nb -= (nv * RX_CBUFFERSIZE);
542 p->length += (nv * RX_CBUFFERSIZE);
543 p->niovecs += nv;
545 return nb;
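/*
 * Typical caller pattern (mirrored by rxi_ReceiveDebugPacket and
 * rxi_ReadPacket below): compute the shortfall, ask this routine to cover it,
 * and treat a positive return value as "still short by that many bytes":
 *
 *     tl = sizeof(struct rx_debugStats) - ap->length;
 *     if (tl > 0)
 *         tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
 *     if (tl > 0)
 *         return ap;                     (could not grow the packet enough)
 */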
549 * Register allocated packets.
551 * @param[in] addr array of packets
552 * @param[in] npkt number of packets
554 * @return none
556 static void
557 registerPackets(struct rx_packet *addr, afs_uint32 npkt)
559 struct rx_mallocedPacket *mp;
561 mp = osi_Alloc(sizeof(*mp));
563 osi_Assert(mp != NULL);
564 memset(mp, 0, sizeof(*mp));
566 mp->addr = addr;
567 mp->size = npkt * sizeof(struct rx_packet);
568 osi_Assert(npkt <= MAX_AFS_UINT32 / sizeof(struct rx_packet));
570 MUTEX_ENTER(&rx_mallocedPktQ_lock);
571 opr_queue_Append(&rx_mallocedPacketQueue, &mp->entry);
572 MUTEX_EXIT(&rx_mallocedPktQ_lock);
575 /* Add more packet buffers */
576 #ifdef RX_ENABLE_TSFPQ
577 void
578 rxi_MorePackets(int apackets)
580 struct rx_packet *p, *e;
581 struct rx_ts_info_t * rx_ts_info;
582 int getme;
583 SPLVAR;
585 getme = apackets * sizeof(struct rx_packet);
586 p = osi_Alloc(getme);
587 osi_Assert(p);
588 registerPackets(p, apackets);
590 PIN(p, getme); /* XXXXX */
591 memset(p, 0, getme);
592 RX_TS_INFO_GET(rx_ts_info);
594 RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
595 /* TSFPQ patch also needs to keep track of total packets */
597 MUTEX_ENTER(&rx_packets_mutex);
598 rx_nPackets += apackets;
599 RX_TS_FPQ_COMPUTE_LIMITS;
600 MUTEX_EXIT(&rx_packets_mutex);
602 for (e = p + apackets; p < e; p++) {
603 RX_PACKET_IOV_INIT(p);
604 p->niovecs = 2;
606 RX_TS_FPQ_CHECKIN(rx_ts_info,p);
608 NETPRI;
609 MUTEX_ENTER(&rx_freePktQ_lock);
610 #ifdef RXDEBUG_PACKET
611 p->packetId = rx_packet_id++;
612 p->allNextp = rx_mallocedP;
613 #endif /* RXDEBUG_PACKET */
614 rx_mallocedP = p;
615 MUTEX_EXIT(&rx_freePktQ_lock);
616 USERPRI;
618 rx_ts_info->_FPQ.delta += apackets;
620 if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
621 NETPRI;
622 MUTEX_ENTER(&rx_freePktQ_lock);
624 RX_TS_FPQ_LTOG(rx_ts_info);
625 rxi_NeedMorePackets = FALSE;
626 rxi_PacketsUnWait();
628 MUTEX_EXIT(&rx_freePktQ_lock);
629 USERPRI;
632 #else /* RX_ENABLE_TSFPQ */
633 void
634 rxi_MorePackets(int apackets)
636 struct rx_packet *p, *e;
637 int getme;
638 SPLVAR;
640 getme = apackets * sizeof(struct rx_packet);
641 p = osi_Alloc(getme);
642 osi_Assert(p);
643 registerPackets(p, apackets);
645 PIN(p, getme); /* XXXXX */
646 memset(p, 0, getme);
647 NETPRI;
648 MUTEX_ENTER(&rx_freePktQ_lock);
650 for (e = p + apackets; p < e; p++) {
651 RX_PACKET_IOV_INIT(p);
652 #ifdef RX_TRACK_PACKETS
653 p->flags |= RX_PKTFLAG_FREE;
654 #endif
655 p->niovecs = 2;
657 opr_queue_Append(&rx_freePacketQueue, &p->entry);
658 #ifdef RXDEBUG_PACKET
659 p->packetId = rx_packet_id++;
660 p->allNextp = rx_mallocedP;
661 #endif /* RXDEBUG_PACKET */
662 rx_mallocedP = p;
665 rx_nPackets += apackets;
666 rx_nFreePackets += apackets;
667 rxi_NeedMorePackets = FALSE;
668 rxi_PacketsUnWait();
670 MUTEX_EXIT(&rx_freePktQ_lock);
671 USERPRI;
673 #endif /* RX_ENABLE_TSFPQ */
675 #ifdef RX_ENABLE_TSFPQ
676 void
677 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
679 struct rx_packet *p, *e;
680 struct rx_ts_info_t * rx_ts_info;
681 int getme;
682 SPLVAR;
684 getme = apackets * sizeof(struct rx_packet);
685 p = osi_Alloc(getme);
686 registerPackets(p, apackets);
688 PIN(p, getme); /* XXXXX */
689 memset(p, 0, getme);
690 RX_TS_INFO_GET(rx_ts_info);
692 RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
693 /* TSFPQ patch also needs to keep track of total packets */
694 MUTEX_ENTER(&rx_packets_mutex);
695 rx_nPackets += apackets;
696 RX_TS_FPQ_COMPUTE_LIMITS;
697 MUTEX_EXIT(&rx_packets_mutex);
699 for (e = p + apackets; p < e; p++) {
700 RX_PACKET_IOV_INIT(p);
701 p->niovecs = 2;
702 RX_TS_FPQ_CHECKIN(rx_ts_info,p);
704 NETPRI;
705 MUTEX_ENTER(&rx_freePktQ_lock);
706 #ifdef RXDEBUG_PACKET
707 p->packetId = rx_packet_id++;
708 p->allNextp = rx_mallocedP;
709 #endif /* RXDEBUG_PACKET */
710 rx_mallocedP = p;
711 MUTEX_EXIT(&rx_freePktQ_lock);
712 USERPRI;
714 rx_ts_info->_FPQ.delta += apackets;
716 if (flush_global &&
717 (num_keep_local < apackets)) {
718 NETPRI;
719 MUTEX_ENTER(&rx_freePktQ_lock);
721 RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
722 rxi_NeedMorePackets = FALSE;
723 rxi_PacketsUnWait();
725 MUTEX_EXIT(&rx_freePktQ_lock);
726 USERPRI;
729 #endif /* RX_ENABLE_TSFPQ */
731 #ifndef KERNEL
732 /* Add more packet buffers */
733 static void
734 rxi_MorePacketsNoLock(int apackets)
736 #ifdef RX_ENABLE_TSFPQ
737 struct rx_ts_info_t * rx_ts_info;
738 #endif /* RX_ENABLE_TSFPQ */
739 struct rx_packet *p, *e;
740 int getme;
742 /* allocate enough packets that 1/4 of the packets will be able
743 * to hold maximal amounts of data */
744 apackets += (apackets / 4)
745 * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
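    /*
     * Worked example with purely illustrative values (the real constants are
     * defined elsewhere): if rx_maxJumboRecvSize were 16384,
     * RX_FIRSTBUFFERSIZE 384 and RX_CBUFFERSIZE 1000, a maximal packet would
     * need (16384 - 384) / 1000 = 16 continuation buffers, so a request for
     * 100 packets would be inflated by (100 / 4) * 16 = 400 extra packet
     * structures to serve as those buffers.
     */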
746 do {
747 getme = apackets * sizeof(struct rx_packet);
748 p = osi_Alloc(getme);
749 if (p == NULL) {
750 apackets -= apackets / 4;
751 osi_Assert(apackets > 0);
753 } while(p == NULL);
754 memset(p, 0, getme);
755 registerPackets(p, apackets);
757 #ifdef RX_ENABLE_TSFPQ
758 RX_TS_INFO_GET(rx_ts_info);
759 RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
760 #endif /* RX_ENABLE_TSFPQ */
762 for (e = p + apackets; p < e; p++) {
763 RX_PACKET_IOV_INIT(p);
764 #ifdef RX_TRACK_PACKETS
765 p->flags |= RX_PKTFLAG_FREE;
766 #endif
767 p->niovecs = 2;
769 opr_queue_Append(&rx_freePacketQueue, &p->entry);
770 #ifdef RXDEBUG_PACKET
771 p->packetId = rx_packet_id++;
772 p->allNextp = rx_mallocedP;
773 #endif /* RXDEBUG_PACKET */
774 rx_mallocedP = p;
777 rx_nFreePackets += apackets;
778 MUTEX_ENTER(&rx_packets_mutex);
779 rx_nPackets += apackets;
780 #ifdef RX_ENABLE_TSFPQ
781 RX_TS_FPQ_COMPUTE_LIMITS;
782 #endif /* RX_ENABLE_TSFPQ */
783 MUTEX_EXIT(&rx_packets_mutex);
784 rxi_NeedMorePackets = FALSE;
785 rxi_PacketsUnWait();
787 #endif /* !KERNEL */
789 void
790 rxi_FreeAllPackets(void)
792 struct rx_mallocedPacket *mp;
794 MUTEX_ENTER(&rx_mallocedPktQ_lock);
796 while (!opr_queue_IsEmpty(&rx_mallocedPacketQueue)) {
797 mp = opr_queue_First(&rx_mallocedPacketQueue,
798 struct rx_mallocedPacket, entry);
799 opr_queue_Remove(&mp->entry);
800 osi_Free(mp->addr, mp->size);
801 UNPIN(mp->addr, mp->size);
802 osi_Free(mp, sizeof(*mp));
804 MUTEX_EXIT(&rx_mallocedPktQ_lock);
807 #ifdef RX_ENABLE_TSFPQ
808 static void
809 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
811 struct rx_ts_info_t * rx_ts_info;
812 int xfer;
813 SPLVAR;
815 RX_TS_INFO_GET(rx_ts_info);
817 if (num_keep_local != rx_ts_info->_FPQ.len) {
818 NETPRI;
819 MUTEX_ENTER(&rx_freePktQ_lock);
820 if (num_keep_local < rx_ts_info->_FPQ.len) {
821 xfer = rx_ts_info->_FPQ.len - num_keep_local;
822 RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
823 rxi_PacketsUnWait();
824 } else {
825 xfer = num_keep_local - rx_ts_info->_FPQ.len;
826 if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
827 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
828 if (rx_nFreePackets < xfer) {
829 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
831 RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
833 MUTEX_EXIT(&rx_freePktQ_lock);
834 USERPRI;
838 void
839 rxi_FlushLocalPacketsTSFPQ(void)
841 rxi_AdjustLocalPacketsTSFPQ(0, 0);
843 #endif /* RX_ENABLE_TSFPQ */
845 /* Allocate more packets iff we need more continuation buffers */
846 /* In kernel, can't page in memory with interrupts disabled, so we
847 * don't use the event mechanism. */
848 void
849 rx_CheckPackets(void)
851 if (rxi_NeedMorePackets) {
852 rxi_MorePackets(rx_maxSendWindow);
856 /* In the packet freeing routine below, the assumption is that
857 we want all of the packets to be used equally frequently, so that we
858 don't get packet buffers paging out. It would be just as valid to
859 assume that we DO want them to page out if not many are being used.
860 In any event, we assume the former, and append the packets to the end
861 of the free list. */
862 /* This explanation is bogus. The free list doesn't remain in any kind of
863 useful order for afs_int32: the packets in use get pretty much randomly scattered
864 across all the pages. In order to permit unused {packets,bufs} to page out, they
865 must be stored so that packets which are adjacent in memory are adjacent in the
866 free list. An array springs rapidly to mind.
869 /* Actually free the packet p. */
870 #ifndef RX_ENABLE_TSFPQ
871 static void
872 rxi_FreePacketNoLock(struct rx_packet *p)
874 dpf(("Free %"AFS_PTR_FMT"\n", p));
876 RX_FPQ_MARK_FREE(p);
877 rx_nFreePackets++;
878 opr_queue_Append(&rx_freePacketQueue, &p->entry);
880 #endif /* RX_ENABLE_TSFPQ */
882 #ifdef RX_ENABLE_TSFPQ
883 static void
884 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
886 struct rx_ts_info_t * rx_ts_info;
887 dpf(("Free %"AFS_PTR_FMT"\n", p));
889 RX_TS_INFO_GET(rx_ts_info);
890 RX_TS_FPQ_CHECKIN(rx_ts_info,p);
892 if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
893 NETPRI;
894 MUTEX_ENTER(&rx_freePktQ_lock);
896 RX_TS_FPQ_LTOG(rx_ts_info);
898 /* Wakeup anyone waiting for packets */
899 rxi_PacketsUnWait();
901 MUTEX_EXIT(&rx_freePktQ_lock);
902 USERPRI;
905 #endif /* RX_ENABLE_TSFPQ */
908 * free continuation buffers off a packet into a queue
910 * [IN] p -- packet from which continuation buffers will be freed
911 * [IN] first -- iovec offset of first continuation buffer to free
912 * [IN] q -- queue into which continuation buffers will be chained
914 * returns:
915 * number of continuation buffers freed
917 #ifndef RX_ENABLE_TSFPQ
918 static int
919 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct opr_queue * q)
921 struct iovec *iov;
922 struct rx_packet * cb;
923 int count = 0;
925 for (first = MAX(2, first); first < p->niovecs; first++, count++) {
926 iov = &p->wirevec[first];
927 if (!iov->iov_base)
928 osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
929 cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
930 RX_FPQ_MARK_FREE(cb);
931 opr_queue_Append(q, &cb->entry);
933 p->length = 0;
934 p->niovecs = 0;
936 return count;
940 * free packet continuation buffers into the global free packet pool
942 * [IN] p -- packet from which to free continuation buffers
943 * [IN] first -- iovec offset of first continuation buffer to free
945 * returns:
946 * zero always
948 static int
949 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
951 struct iovec *iov;
953 for (first = MAX(2, first); first < p->niovecs; first++) {
954 iov = &p->wirevec[first];
955 if (!iov->iov_base)
956 osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
957 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
959 p->length = 0;
960 p->niovecs = 0;
962 return 0;
965 #else
968 * free packet continuation buffers into the thread-local free pool
970 * [IN] p -- packet from which continuation buffers will be freed
971 * [IN] first -- iovec offset of first continuation buffer to free
972 * any value less than 2, the min number of iovecs,
973 * is treated as if it is 2.
974 * [IN] flush_global -- if nonzero, we will flush overquota packets to the
975 * global free pool before returning
977 * returns:
978 * zero always
980 static int
981 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
983 struct iovec *iov;
984 struct rx_ts_info_t * rx_ts_info;
986 RX_TS_INFO_GET(rx_ts_info);
988 for (first = MAX(2, first); first < p->niovecs; first++) {
989 iov = &p->wirevec[first];
990 if (!iov->iov_base)
991 osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
992 RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
994 p->length = 0;
995 p->niovecs = 0;
997 if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
998 NETPRI;
999 MUTEX_ENTER(&rx_freePktQ_lock);
1001 RX_TS_FPQ_LTOG(rx_ts_info);
1003 /* Wakeup anyone waiting for packets */
1004 rxi_PacketsUnWait();
1006 MUTEX_EXIT(&rx_freePktQ_lock);
1007 USERPRI;
1009 return 0;
1011 #endif /* RX_ENABLE_TSFPQ */
1013 int rxi_nBadIovecs = 0;
1015 /* rxi_RestoreDataBufs
1017 * Restore the correct sizes to the iovecs. Called when reusing a packet
1018 * for reading off the wire.
1020 void
1021 rxi_RestoreDataBufs(struct rx_packet *p)
1023 unsigned int i;
1024 struct iovec *iov;
1026 RX_PACKET_IOV_INIT(p);
1028 for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
1029 if (!iov->iov_base) {
1030 rxi_nBadIovecs++;
1031 p->niovecs = i;
1032 break;
1034 iov->iov_len = RX_CBUFFERSIZE;
1038 #ifdef RX_ENABLE_TSFPQ
1040 rxi_TrimDataBufs(struct rx_packet *p, int first)
1042 int length;
1043 struct iovec *iov, *end;
1044 struct rx_ts_info_t * rx_ts_info;
1045 SPLVAR;
1047 if (first != 1)
1048 osi_Panic("TrimDataBufs 1: first must be 1");
1050 /* Skip over continuation buffers containing message data */
1051 iov = &p->wirevec[2];
1052 end = iov + (p->niovecs - 2);
1053 length = p->length - p->wirevec[1].iov_len;
1054 for (; iov < end && length > 0; iov++) {
1055 if (!iov->iov_base)
1056 osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1057 length -= iov->iov_len;
1060 /* iov now points to the first empty data buffer. */
1061 if (iov >= end)
1062 return 0;
1064 RX_TS_INFO_GET(rx_ts_info);
1065 for (; iov < end; iov++) {
1066 if (!iov->iov_base)
1067 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1068 RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1069 p->niovecs--;
1071 if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1072 NETPRI;
1073 MUTEX_ENTER(&rx_freePktQ_lock);
1075 RX_TS_FPQ_LTOG(rx_ts_info);
1076 rxi_PacketsUnWait();
1078 MUTEX_EXIT(&rx_freePktQ_lock);
1079 USERPRI;
1082 return 0;
1084 #else /* RX_ENABLE_TSFPQ */
1086 rxi_TrimDataBufs(struct rx_packet *p, int first)
1088 int length;
1089 struct iovec *iov, *end;
1090 SPLVAR;
1092 if (first != 1)
1093 osi_Panic("TrimDataBufs 1: first must be 1");
1095 /* Skip over continuation buffers containing message data */
1096 iov = &p->wirevec[2];
1097 end = iov + (p->niovecs - 2);
1098 length = p->length - p->wirevec[1].iov_len;
1099 for (; iov < end && length > 0; iov++) {
1100 if (!iov->iov_base)
1101 osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1102 length -= iov->iov_len;
1105 /* iov now points to the first empty data buffer. */
1106 if (iov >= end)
1107 return 0;
1109 NETPRI;
1110 MUTEX_ENTER(&rx_freePktQ_lock);
1112 for (; iov < end; iov++) {
1113 if (!iov->iov_base)
1114 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1115 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1116 p->niovecs--;
1118 rxi_PacketsUnWait();
1120 MUTEX_EXIT(&rx_freePktQ_lock);
1121 USERPRI;
1123 return 0;
1125 #endif /* RX_ENABLE_TSFPQ */
1127 /* Free the packet p. P is assumed not to be on any queue, i.e.
1128 * remove it yourself first if you call this routine. */
1129 #ifdef RX_ENABLE_TSFPQ
1130 void
1131 rxi_FreePacket(struct rx_packet *p)
1133 rxi_FreeDataBufsTSFPQ(p, 2, 0);
1134 rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1136 #else /* RX_ENABLE_TSFPQ */
1137 void
1138 rxi_FreePacket(struct rx_packet *p)
1140 SPLVAR;
1142 NETPRI;
1143 MUTEX_ENTER(&rx_freePktQ_lock);
1145 rxi_FreeDataBufsNoLock(p, 2);
1146 rxi_FreePacketNoLock(p);
1147 /* Wakeup anyone waiting for packets */
1148 rxi_PacketsUnWait();
1150 MUTEX_EXIT(&rx_freePktQ_lock);
1151 USERPRI;
1153 #endif /* RX_ENABLE_TSFPQ */
1155 /* rxi_AllocPacket sets up p->length so it reflects the number of
1156 * bytes in the packet at this point, **not including** the header.
1157 * The header is absolutely necessary, besides, this is the way the
1158 * length field is usually used */
1159 #ifdef RX_ENABLE_TSFPQ
1160 static struct rx_packet *
1161 rxi_AllocPacketNoLock(int class)
1163 struct rx_packet *p;
1164 struct rx_ts_info_t * rx_ts_info;
1166 RX_TS_INFO_GET(rx_ts_info);
1168 if (rx_stats_active)
1169 rx_atomic_inc(&rx_stats.packetRequests);
1170 if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1172 #ifdef KERNEL
1173 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1174 osi_Panic("rxi_AllocPacket error");
1175 #else /* KERNEL */
1176 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1177 rxi_MorePacketsNoLock(rx_maxSendWindow);
1178 #endif /* KERNEL */
1181 RX_TS_FPQ_GTOL(rx_ts_info);
1184 RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1186 dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1189 /* have to do this here because rx_FlushWrite fiddles with the iovs in
1190 * order to truncate outbound packets. In the near future, may need
1191 * to allocate bufs from a static pool here, and/or in AllocSendPacket
1193 RX_PACKET_IOV_FULLINIT(p);
1194 return p;
1196 #else /* RX_ENABLE_TSFPQ */
1197 static struct rx_packet *
1198 rxi_AllocPacketNoLock(int class)
1200 struct rx_packet *p;
1202 #ifdef KERNEL
1203 if (rxi_OverQuota(class)) {
1204 rxi_NeedMorePackets = TRUE;
1205 if (rx_stats_active) {
1206 switch (class) {
1207 case RX_PACKET_CLASS_RECEIVE:
1208 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1209 break;
1210 case RX_PACKET_CLASS_SEND:
1211 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1212 break;
1213 case RX_PACKET_CLASS_SPECIAL:
1214 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1215 break;
1216 case RX_PACKET_CLASS_RECV_CBUF:
1217 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1218 break;
1219 case RX_PACKET_CLASS_SEND_CBUF:
1220 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1221 break;
1224 return (struct rx_packet *)0;
1226 #endif /* KERNEL */
1228 if (rx_stats_active)
1229 rx_atomic_inc(&rx_stats.packetRequests);
1231 #ifdef KERNEL
1232 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1233 osi_Panic("rxi_AllocPacket error");
1234 #else /* KERNEL */
1235 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1236 rxi_MorePacketsNoLock(rx_maxSendWindow);
1237 #endif /* KERNEL */
1239 rx_nFreePackets--;
1240 p = opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
1241 opr_queue_Remove(&p->entry);
1242 RX_FPQ_MARK_USED(p);
1244 dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1247 /* have to do this here because rx_FlushWrite fiddles with the iovs in
1248 * order to truncate outbound packets. In the near future, may need
1249 * to allocate bufs from a static pool here, and/or in AllocSendPacket
1251 RX_PACKET_IOV_FULLINIT(p);
1252 return p;
1254 #endif /* RX_ENABLE_TSFPQ */
1256 #ifdef RX_ENABLE_TSFPQ
1257 static struct rx_packet *
1258 rxi_AllocPacketTSFPQ(int class, int pull_global)
1260 struct rx_packet *p;
1261 struct rx_ts_info_t * rx_ts_info;
1263 RX_TS_INFO_GET(rx_ts_info);
1265 if (rx_stats_active)
1266 rx_atomic_inc(&rx_stats.packetRequests);
1267 if (pull_global && opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1268 MUTEX_ENTER(&rx_freePktQ_lock);
1270 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1271 rxi_MorePacketsNoLock(rx_maxSendWindow);
1273 RX_TS_FPQ_GTOL(rx_ts_info);
1275 MUTEX_EXIT(&rx_freePktQ_lock);
1276 } else if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1277 return NULL;
1280 RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1282 dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1284 /* have to do this here because rx_FlushWrite fiddles with the iovs in
1285 * order to truncate outbound packets. In the near future, may need
1286 * to allocate bufs from a static pool here, and/or in AllocSendPacket
1288 RX_PACKET_IOV_FULLINIT(p);
1289 return p;
1291 #endif /* RX_ENABLE_TSFPQ */
1293 #ifdef RX_ENABLE_TSFPQ
1294 struct rx_packet *
1295 rxi_AllocPacket(int class)
1297 struct rx_packet *p;
1299 p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1300 return p;
1302 #else /* RX_ENABLE_TSFPQ */
1303 struct rx_packet *
1304 rxi_AllocPacket(int class)
1306 struct rx_packet *p;
1308 MUTEX_ENTER(&rx_freePktQ_lock);
1309 p = rxi_AllocPacketNoLock(class);
1310 MUTEX_EXIT(&rx_freePktQ_lock);
1311 return p;
1313 #endif /* RX_ENABLE_TSFPQ */
1315 /* This guy comes up with as many buffers as it {takes,can get} given
1316 * the MTU for this call. It also sets the packet length before
1317 * returning. caution: this is often called at NETPRI
1318 * Called with call locked.
1320 struct rx_packet *
1321 rxi_AllocSendPacket(struct rx_call *call, int want)
1323 struct rx_packet *p = (struct rx_packet *)0;
1324 int mud;
1325 unsigned delta;
1327 SPLVAR;
1328 mud = call->MTU - RX_HEADER_SIZE;
1329 delta =
1330 rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1331 rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1333 #ifdef RX_ENABLE_TSFPQ
1334 if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1335 want += delta;
1336 want = MIN(want, mud);
1338 if ((unsigned)want > p->length)
1339 (void)rxi_AllocDataBuf(p, (want - p->length),
1340 RX_PACKET_CLASS_SEND_CBUF);
1342 if (p->length > mud)
1343 p->length = mud;
1345 if (delta >= p->length) {
1346 rxi_FreePacket(p);
1347 p = NULL;
1348 } else {
1349 p->length -= delta;
1351 return p;
1353 #endif /* RX_ENABLE_TSFPQ */
1355 while (!(call->error)) {
1356 MUTEX_ENTER(&rx_freePktQ_lock);
1357 /* if an error occurred, or we get the packet we want, we're done */
1358 if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1359 MUTEX_EXIT(&rx_freePktQ_lock);
1361 want += delta;
1362 want = MIN(want, mud);
1364 if ((unsigned)want > p->length)
1365 (void)rxi_AllocDataBuf(p, (want - p->length),
1366 RX_PACKET_CLASS_SEND_CBUF);
1368 if (p->length > mud)
1369 p->length = mud;
1371 if (delta >= p->length) {
1372 rxi_FreePacket(p);
1373 p = NULL;
1374 } else {
1375 p->length -= delta;
1377 break;
1380 /* no error occurred, and we didn't get a packet, so we sleep.
1381 * At this point, we assume that packets will be returned
1382 * sooner or later, as packets are acknowledged, and so we
1383 * just wait. */
1384 NETPRI;
1385 call->flags |= RX_CALL_WAIT_PACKETS;
1386 CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1387 MUTEX_EXIT(&call->lock);
1388 rx_waitingForPackets = 1;
1390 #ifdef RX_ENABLE_LOCKS
1391 CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1392 #else
1393 osi_rxSleep(&rx_waitingForPackets);
1394 #endif
1395 MUTEX_EXIT(&rx_freePktQ_lock);
1396 MUTEX_ENTER(&call->lock);
1397 CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1398 call->flags &= ~RX_CALL_WAIT_PACKETS;
1399 USERPRI;
1402 return p;
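/*
 * Sizing sketch with made-up numbers: with mud = call->MTU - RX_HEADER_SIZE,
 * a security class needing (say) delta = 24 bytes of header plus trailer, and
 * a caller asking for want = 1000 bytes, the packet is grown toward
 * MIN(1000 + 24, mud) and then shrunk by delta again, so the caller never
 * sees more than mud - delta bytes of usable payload per packet.
 */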
1405 #ifndef KERNEL
1406 #ifdef AFS_NT40_ENV
1407 /* Windows does not use file descriptors. */
1408 #define CountFDs(amax) 0
1409 #else
1410 /* count the number of used FDs */
1411 static int
1412 CountFDs(int amax)
1414 struct stat tstat;
1415 int i, code;
1416 int count;
1418 count = 0;
1419 for (i = 0; i < amax; i++) {
1420 code = fstat(i, &tstat);
1421 if (code == 0)
1422 count++;
1424 return count;
1426 #endif /* AFS_NT40_ENV */
1427 #else /* KERNEL */
1429 #define CountFDs(amax) amax
1431 #endif /* KERNEL */
1433 #if !defined(KERNEL) || defined(UKERNEL)
1435 /* This function reads a single packet from the interface into the
1436 * supplied packet buffer (*p). Return 0 if the packet is bogus. The
1437 * (host,port) of the sender are stored in the supplied variables, and
1438 * the data length of the packet is stored in the packet structure.
1439 * The header is decoded. */
1441 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1442 u_short * port)
1444 struct sockaddr_in from;
1445 int nbytes;
1446 afs_int32 rlen;
1447 afs_uint32 tlen, savelen;
1448 struct msghdr msg;
1449 rx_computelen(p, tlen);
1450 rx_SetDataSize(p, tlen); /* this is the size of the user data area */
1452 tlen += RX_HEADER_SIZE; /* now this is the size of the entire packet */
1453 rlen = rx_maxJumboRecvSize; /* this is what I am advertising. Only check
1454 * it once in order to avoid races. */
1455 tlen = rlen - tlen;
1456 if (tlen > 0) {
1457 tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1458 if (tlen > 0) {
1459 tlen = rlen - tlen;
1460 } else
1461 tlen = rlen;
1462 } else
1463 tlen = rlen;
1465 /* Extend the last iovec for padding, it's just to make sure that the
1466 * read doesn't return more data than we expect, and is done to get around
1467 * our problems caused by the lack of a length field in the rx header.
1468 * Use the extra buffer that follows the localdata in each packet
1469 * structure. */
1470 savelen = p->wirevec[p->niovecs - 1].iov_len;
1471 p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1473 memset(&msg, 0, sizeof(msg));
1474 msg.msg_name = (char *)&from;
1475 msg.msg_namelen = sizeof(struct sockaddr_in);
1476 msg.msg_iov = p->wirevec;
1477 msg.msg_iovlen = p->niovecs;
1478 nbytes = rxi_Recvmsg(socket, &msg, 0);
1480 /* restore the vec to its correct state */
1481 p->wirevec[p->niovecs - 1].iov_len = savelen;
1483 p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1484 if (nbytes < 0 || (nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
1485 if (nbytes < 0 && errno == EWOULDBLOCK) {
1486 if (rx_stats_active)
1487 rx_atomic_inc(&rx_stats.noPacketOnRead);
1488 } else if (nbytes <= 0) {
1489 if (rx_stats_active) {
1490 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1491 rx_stats.bogusHost = from.sin_addr.s_addr;
1493 dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1494 ntohs(from.sin_port), nbytes));
1496 return 0;
1498 #ifdef RXDEBUG
1499 else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1500 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1501 rxi_DecodePacketHeader(p);
1503 *host = from.sin_addr.s_addr;
1504 *port = from.sin_port;
1506 dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1507 p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1508 p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1509 p->length));
1510 #ifdef RX_TRIMDATABUFS
1511 rxi_TrimDataBufs(p, 1);
1512 #endif
1513 return 0;
1515 #endif
1516 else {
1517 /* Extract packet header. */
1518 rxi_DecodePacketHeader(p);
1520 *host = from.sin_addr.s_addr;
1521 *port = from.sin_port;
1522 if (rx_stats_active
1523 && p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1525 rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1528 #ifdef RX_TRIMDATABUFS
1529 /* Free any empty packet buffers at the end of this packet */
1530 rxi_TrimDataBufs(p, 1);
1531 #endif
1532 return 1;
1536 #endif /* !KERNEL || UKERNEL */
1538 /* This function splits off the first packet in a jumbo packet.
1539 * As of AFS 3.5, jumbograms contain more than one fixed size
1540 * packet, and the RX_JUMBO_PACKET flag is set in all but the
1541 * last packet header. All packets (except the last) are padded to
1542 * fall on RX_CBUFFERSIZE boundaries.
1543 * HACK: We store the length of the first n-1 packets in the
1544 * last two pad bytes. */
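/*
 * Rough on-wire layout of an N-packet jumbogram, as implied by the splitting
 * code below (the first packet's rx_header covers the whole datagram):
 *
 *   [rx_header][RX_JUMBOBUFFERSIZE data][4-byte rx_jumboHeader]
 *              [RX_JUMBOBUFFERSIZE data][4-byte rx_jumboHeader]
 *              ...
 *              [remaining data for the final packet]
 *
 * Each rx_jumboHeader carries the flags byte and header checksum for the
 * packet that follows it; seq and serial are reconstructed by adding one for
 * each packet split off.
 */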
1546 struct rx_packet *
1547 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1548 int first)
1550 struct rx_packet *np;
1551 struct rx_jumboHeader *jp;
1552 int niov, i;
1553 struct iovec *iov;
1554 int length;
1555 afs_uint32 temp;
1557 /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1558 * bytes in length. All but the first packet are preceded by
1559 * an abbreviated four byte header. The length of the last packet
1560 * is calculated from the size of the jumbogram. */
1561 length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1563 if ((int)p->length < length) {
1564 dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1565 return NULL;
1567 niov = p->niovecs - 2;
1568 if (niov < 1) {
1569 dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1570 return NULL;
1572 iov = &p->wirevec[2];
1573 np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1575 /* Get a pointer to the abbreviated packet header */
1576 jp = (struct rx_jumboHeader *)
1577 ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1579 /* Set up the iovecs for the next packet */
1580 np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1581 np->wirevec[0].iov_len = sizeof(struct rx_header);
1582 np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1583 np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1584 np->niovecs = niov + 1;
1585 for (i = 2, iov++; i <= niov; i++, iov++) {
1586 np->wirevec[i] = *iov;
1588 np->length = p->length - length;
1589 p->length = RX_JUMBOBUFFERSIZE;
1590 p->niovecs = 2;
1592 /* Convert the jumbo packet header to host byte order */
1593 temp = ntohl(*(afs_uint32 *) jp);
1594 jp->flags = (u_char) (temp >> 24);
1595 jp->cksum = (u_short) (temp);
1597 /* Fill in the packet header */
1598 np->header = p->header;
1599 np->header.serial = p->header.serial + 1;
1600 np->header.seq = p->header.seq + 1;
1601 np->header.flags = jp->flags;
1602 np->header.spare = jp->cksum;
1604 return np;
1607 #ifndef KERNEL
1608 /* Send a udp datagram */
1610 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1611 int length, int istack)
1613 struct msghdr msg;
1614 int ret;
1616 memset(&msg, 0, sizeof(msg));
1617 msg.msg_iov = dvec;
1618 msg.msg_iovlen = nvecs;
1619 msg.msg_name = addr;
1620 msg.msg_namelen = sizeof(struct sockaddr_in);
1622 ret = rxi_Sendmsg(socket, &msg, 0);
1624 return ret;
1626 #elif !defined(UKERNEL)
1627 /*
1628 * message receipt is done in rxk_input or rx_put.
1629 */
1631 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1633 * Copy an mblock to the contiguous area pointed to by cp.
1634 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1635 * but it doesn't really.
1636 * Returns the number of bytes not transferred.
1637 * The message is NOT changed.
1639 static int
1640 cpytoc(mblk_t * mp, int off, int len, char *cp)
1642 int n;
1644 for (; mp && len > 0; mp = mp->b_cont) {
1645 if (mp->b_datap->db_type != M_DATA) {
1646 return -1;
1648 n = MIN(len, (mp->b_wptr - mp->b_rptr));
1649 memcpy(cp, (char *)mp->b_rptr, n);
1650 cp += n;
1651 len -= n;
1652 mp->b_rptr += n;
1654 return (len);
1657 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1658 * but it doesn't really.
1659 * This sucks, anyway, do it like m_cpy.... below
1661 static int
1662 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1663 int niovs)
1665 int m, n, o, t, i;
1667 for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1668 if (mp->b_datap->db_type != M_DATA) {
1669 return -1;
1671 n = MIN(len, (mp->b_wptr - mp->b_rptr));
1672 len -= n;
1673 while (n) {
1674 if (!t) {
1675 o = 0;
1676 i++;
1677 t = iovs[i].iov_len;
1679 m = MIN(n, t);
1680 memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1681 mp->b_rptr += m;
1682 o += m;
1683 t -= m;
1684 n -= m;
1687 return (len);
1690 #define m_cpytoc(a, b, c, d) cpytoc(a, b, c, d)
1691 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1692 #else
1693 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1694 static int
1695 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1697 caddr_t p1, p2;
1698 unsigned int l1, l2, i, t;
1700 if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1701 osi_Panic("m_cpytoiovec"); /* MTUXXX probably don't need this check */
1703 while (off && m)
1704 if (m->m_len <= off) {
1705 off -= m->m_len;
1706 m = m->m_next;
1707 continue;
1708 } else
1709 break;
1711 if (m == NULL)
1712 return len;
1714 p1 = mtod(m, caddr_t) + off;
1715 l1 = m->m_len - off;
1716 i = 0;
1717 p2 = iovs[0].iov_base;
1718 l2 = iovs[0].iov_len;
1720 while (len) {
1721 t = MIN(l1, MIN(l2, (unsigned int)len));
1722 memcpy(p2, p1, t);
1723 p1 += t;
1724 p2 += t;
1725 l1 -= t;
1726 l2 -= t;
1727 len -= t;
1728 if (!l1) {
1729 m = m->m_next;
1730 if (!m)
1731 break;
1732 p1 = mtod(m, caddr_t);
1733 l1 = m->m_len;
1735 if (!l2) {
1736 if (++i >= niovs)
1737 break;
1738 p2 = iovs[i].iov_base;
1739 l2 = iovs[i].iov_len;
1744 return len;
1746 #endif /* LINUX */
1747 #endif /* AFS_SUN5_ENV */
1749 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1750 #if defined(AFS_NBSD_ENV)
1752 rx_mb_to_packet(struct mbuf *amb, void (*free) (struct mbuf *), int hdr_len, int data_len, struct rx_packet *phandle)
1753 #else
1755 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1756 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1757 mblk_t *amb;
1758 #else
1759 struct mbuf *amb;
1760 #endif
1761 void (*free) ();
1762 struct rx_packet *phandle;
1763 int hdr_len, data_len;
1764 #endif /* AFS_NBSD_ENV */
1766 int code;
1768 code =
1769 m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1770 phandle->niovecs);
1771 (*free) (amb);
1773 return code;
1775 #endif /* LINUX */
1776 #endif /*KERNEL && !UKERNEL */
1779 /* send a response to a debug packet */
1781 struct rx_packet *
1782 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1783 afs_uint32 ahost, short aport, int istack)
1785 struct rx_debugIn tin;
1786 afs_int32 tl;
1789 * Only respond to client-initiated Rx debug packets,
1790 * and clear the client flag in the response.
1792 if (ap->header.flags & RX_CLIENT_INITIATED) {
1793 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1794 rxi_EncodePacketHeader(ap);
1795 } else {
1796 return ap;
1799 rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1800 /* all done with packet, now set length to the truth, so we can
1801 * reuse this packet */
1802 rx_computelen(ap, ap->length);
1804 tin.type = ntohl(tin.type);
1805 tin.index = ntohl(tin.index);
1806 switch (tin.type) {
1807 case RX_DEBUGI_GETSTATS:{
1808 struct rx_debugStats tstat;
1810 /* get basic stats */
1811 memset(&tstat, 0, sizeof(tstat)); /* make sure spares are zero */
1812 tstat.version = RX_DEBUGI_VERSION;
1813 #ifndef RX_ENABLE_LOCKS
1814 tstat.waitingForPackets = rx_waitingForPackets;
1815 #endif
1816 MUTEX_ENTER(&rx_serverPool_lock);
1817 tstat.nFreePackets = htonl(rx_nFreePackets);
1818 tstat.nPackets = htonl(rx_nPackets);
1819 tstat.callsExecuted = htonl(rxi_nCalls);
1820 tstat.packetReclaims = htonl(rx_packetReclaims);
1821 tstat.usedFDs = CountFDs(64);
1822 tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1823 tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1824 tstat.idleThreads = opr_queue_Count(&rx_idleServerQueue);
1825 MUTEX_EXIT(&rx_serverPool_lock);
1826 tstat.idleThreads = htonl(tstat.idleThreads);
1827 tl = sizeof(struct rx_debugStats) - ap->length;
1828 if (tl > 0)
1829 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1831 if (tl <= 0) {
1832 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1833 (char *)&tstat);
1834 ap->length = sizeof(struct rx_debugStats);
1835 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1836 rx_computelen(ap, ap->length);
1838 break;
1841 case RX_DEBUGI_GETALLCONN:
1842 case RX_DEBUGI_GETCONN:{
1843 unsigned int i, j;
1844 struct rx_connection *tc;
1845 struct rx_call *tcall;
1846 struct rx_debugConn tconn;
1847 int all = (tin.type == RX_DEBUGI_GETALLCONN);
1850 tl = sizeof(struct rx_debugConn) - ap->length;
1851 if (tl > 0)
1852 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1853 if (tl > 0)
1854 return ap;
1856 memset(&tconn, 0, sizeof(tconn)); /* make sure spares are zero */
1857 /* get N'th (maybe) "interesting" connection info */
1858 for (i = 0; i < rx_hashTableSize; i++) {
1859 #if !defined(KERNEL)
1860 /* the time complexity of the algorithm used here
1861 * exponentially increases with the number of connections.
1863 #ifdef AFS_PTHREAD_ENV
1864 pthread_yield();
1865 #else
1866 (void)IOMGR_Poll();
1867 #endif
1868 #endif
1869 MUTEX_ENTER(&rx_connHashTable_lock);
1870 /* We might be slightly out of step since we are not
1871 * locking each call, but this is only debugging output.
1873 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1874 if ((all || rxi_IsConnInteresting(tc))
1875 && tin.index-- <= 0) {
1876 tconn.host = tc->peer->host;
1877 tconn.port = tc->peer->port;
1878 tconn.cid = htonl(tc->cid);
1879 tconn.epoch = htonl(tc->epoch);
1880 tconn.serial = htonl(tc->serial);
1881 for (j = 0; j < RX_MAXCALLS; j++) {
1882 tconn.callNumber[j] = htonl(tc->callNumber[j]);
1883 if ((tcall = tc->call[j])) {
1884 tconn.callState[j] = tcall->state;
1885 tconn.callMode[j] = tcall->app.mode;
1886 tconn.callFlags[j] = tcall->flags;
1887 if (!opr_queue_IsEmpty(&tcall->rq))
1888 tconn.callOther[j] |= RX_OTHER_IN;
1889 if (!opr_queue_IsEmpty(&tcall->tq))
1890 tconn.callOther[j] |= RX_OTHER_OUT;
1891 } else
1892 tconn.callState[j] = RX_STATE_NOTINIT;
1895 tconn.natMTU = htonl(tc->peer->natMTU);
1896 tconn.error = htonl(tc->error);
1897 tconn.flags = tc->flags;
1898 tconn.type = tc->type;
1899 tconn.securityIndex = tc->securityIndex;
1900 if (tc->securityObject) {
1901 RXS_GetStats(tc->securityObject, tc,
1902 &tconn.secStats);
1903 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1904 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1905 DOHTONL(flags);
1906 DOHTONL(expires);
1907 DOHTONL(packetsReceived);
1908 DOHTONL(packetsSent);
1909 DOHTONL(bytesReceived);
1910 DOHTONL(bytesSent);
1911 for (i = 0;
1912 i <
1913 sizeof(tconn.secStats.spares) /
1914 sizeof(short); i++)
1915 DOHTONS(spares[i]);
1916 for (i = 0;
1917 i <
1918 sizeof(tconn.secStats.sparel) /
1919 sizeof(afs_int32); i++)
1920 DOHTONL(sparel[i]);
1923 MUTEX_EXIT(&rx_connHashTable_lock);
1924 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1925 (char *)&tconn);
1926 tl = ap->length;
1927 ap->length = sizeof(struct rx_debugConn);
1928 rxi_SendDebugPacket(ap, asocket, ahost, aport,
1929 istack);
1930 ap->length = tl;
1931 return ap;
1934 MUTEX_EXIT(&rx_connHashTable_lock);
1936 /* if we make it here, there are no interesting packets */
1937 tconn.cid = htonl(0xffffffff); /* means end */
1938 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1939 (char *)&tconn);
1940 tl = ap->length;
1941 ap->length = sizeof(struct rx_debugConn);
1942 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1943 ap->length = tl;
1944 break;
1948 * Pass back all the peer structures we have available
1951 case RX_DEBUGI_GETPEER:{
1952 unsigned int i;
1953 struct rx_peer *tp;
1954 struct rx_debugPeer tpeer;
1957 tl = sizeof(struct rx_debugPeer) - ap->length;
1958 if (tl > 0)
1959 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1960 if (tl > 0)
1961 return ap;
1963 memset(&tpeer, 0, sizeof(tpeer));
1964 for (i = 0; i < rx_hashTableSize; i++) {
1965 #if !defined(KERNEL)
1966 /* the time complexity of the algorithm used here
1967 * exponentially increases with the number of peers.
1969 * Yielding after processing each hash table entry
1970 * and dropping rx_peerHashTable_lock
1971 * also increases the risk that we will miss a new
1972 * entry - but we are willing to live with this
1973 * limitation since this is meant for debugging only
1975 #ifdef AFS_PTHREAD_ENV
1976 pthread_yield();
1977 #else
1978 (void)IOMGR_Poll();
1979 #endif
1980 #endif
1981 MUTEX_ENTER(&rx_peerHashTable_lock);
1982 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1983 if (tin.index-- <= 0) {
1984 tp->refCount++;
1985 MUTEX_EXIT(&rx_peerHashTable_lock);
1987 MUTEX_ENTER(&tp->peer_lock);
1988 tpeer.host = tp->host;
1989 tpeer.port = tp->port;
1990 tpeer.ifMTU = htons(tp->ifMTU);
1991 tpeer.idleWhen = htonl(tp->idleWhen);
1992 tpeer.refCount = htons(tp->refCount);
1993 tpeer.burstSize = 0;
1994 tpeer.burst = 0;
1995 tpeer.burstWait.sec = 0;
1996 tpeer.burstWait.usec = 0;
1997 tpeer.rtt = htonl(tp->rtt);
1998 tpeer.rtt_dev = htonl(tp->rtt_dev);
1999 tpeer.nSent = htonl(tp->nSent);
2000 tpeer.reSends = htonl(tp->reSends);
2001 tpeer.natMTU = htons(tp->natMTU);
2002 tpeer.maxMTU = htons(tp->maxMTU);
2003 tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2004 tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2005 tpeer.MTU = htons(tp->MTU);
2006 tpeer.cwind = htons(tp->cwind);
2007 tpeer.nDgramPackets = htons(tp->nDgramPackets);
2008 tpeer.congestSeq = htons(tp->congestSeq);
2009 tpeer.bytesSent.high =
2010 htonl(tp->bytesSent >> 32);
2011 tpeer.bytesSent.low =
2012 htonl(tp->bytesSent & MAX_AFS_UINT32);
2013 tpeer.bytesReceived.high =
2014 htonl(tp->bytesReceived >> 32);
2015 tpeer.bytesReceived.low =
2016 htonl(tp->bytesReceived & MAX_AFS_UINT32);
2017 MUTEX_EXIT(&tp->peer_lock);
2019 MUTEX_ENTER(&rx_peerHashTable_lock);
2020 tp->refCount--;
2021 MUTEX_EXIT(&rx_peerHashTable_lock);
2023 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2024 (char *)&tpeer);
2025 tl = ap->length;
2026 ap->length = sizeof(struct rx_debugPeer);
2027 rxi_SendDebugPacket(ap, asocket, ahost, aport,
2028 istack);
2029 ap->length = tl;
2030 return ap;
2033 MUTEX_EXIT(&rx_peerHashTable_lock);
2035 /* if we make it here, there are no interesting packets */
2036 tpeer.host = htonl(0xffffffff); /* means end */
2037 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2038 (char *)&tpeer);
2039 tl = ap->length;
2040 ap->length = sizeof(struct rx_debugPeer);
2041 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2042 ap->length = tl;
2043 break;
2046 case RX_DEBUGI_RXSTATS:{
2047 int i;
2048 afs_int32 *s;
2050 tl = sizeof(rx_stats) - ap->length;
2051 if (tl > 0)
2052 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2053 if (tl > 0)
2054 return ap;
2056 /* Since it's all int32s, convert to network order with a loop. */
2057 if (rx_stats_active)
2058 MUTEX_ENTER(&rx_stats_mutex);
2059 s = (afs_int32 *) & rx_stats;
2060 for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2061 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2063 tl = ap->length;
2064 ap->length = sizeof(rx_stats);
2065 if (rx_stats_active)
2066 MUTEX_EXIT(&rx_stats_mutex);
2067 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2068 ap->length = tl;
2069 break;
2072 default:
2073 /* error response packet */
2074 tin.type = htonl(RX_DEBUGI_BADTYPE);
2075 tin.index = tin.type;
2076 rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2077 tl = ap->length;
2078 ap->length = sizeof(struct rx_debugIn);
2079 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2080 ap->length = tl;
2081 break;
2083 return ap;
2086 struct rx_packet *
2087 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2088 afs_uint32 ahost, short aport, int istack)
2090 afs_int32 tl;
2093 * Only respond to client-initiated version requests, and
2094 * clear that flag in the response.
2096 if (ap->header.flags & RX_CLIENT_INITIATED) {
2097 char buf[66];
2099 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2100 rxi_EncodePacketHeader(ap);
2101 memset(buf, 0, sizeof(buf));
2102 strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2103 rx_packetwrite(ap, 0, 65, buf);
2104 tl = ap->length;
2105 ap->length = 65;
2106 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2107 ap->length = tl;
2110 return ap;
2114 /* send a debug packet back to the sender */
2115 static void
2116 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2117 afs_uint32 ahost, short aport, afs_int32 istack)
2119 struct sockaddr_in taddr;
2120 unsigned int i, nbytes, savelen = 0;
2121 int saven = 0;
2122 #ifdef KERNEL
2123 int waslocked = ISAFS_GLOCK();
2124 #endif
2126 taddr.sin_family = AF_INET;
2127 taddr.sin_port = aport;
2128 taddr.sin_addr.s_addr = ahost;
2129 memset(&taddr.sin_zero, 0, sizeof(taddr.sin_zero));
2130 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2131 taddr.sin_len = sizeof(struct sockaddr_in);
2132 #endif
2134 /* Trim the iovec chain so that exactly apacket->length data bytes are sent;
2135  * the original last iov_len and niovecs are saved and restored below. */
2135 nbytes = apacket->length;
2136 for (i = 1; i < apacket->niovecs; i++) {
2137 if (nbytes <= apacket->wirevec[i].iov_len) {
2138 savelen = apacket->wirevec[i].iov_len;
2139 saven = apacket->niovecs;
2140 apacket->wirevec[i].iov_len = nbytes;
2141 apacket->niovecs = i + 1; /* so condition fails because i == niovecs */
2142 } else
2143 nbytes -= apacket->wirevec[i].iov_len;
2145 #ifdef KERNEL
2146 #ifdef RX_KERNEL_TRACE
2147 if (ICL_SETACTIVE(afs_iclSetp)) {
2148 if (!waslocked)
2149 AFS_GLOCK();
2150 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2151 "before osi_NetSend()");
2152 AFS_GUNLOCK();
2154 #else
2155 if (waslocked)
2156 AFS_GUNLOCK();
2157 #endif
2158 #endif
2159 /* Debug packets need not be delivered reliably, so any send error is ignored (hence the (void) cast below). */
2160 (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2161 apacket->length + RX_HEADER_SIZE, istack);
2162 #ifdef KERNEL
2163 #ifdef RX_KERNEL_TRACE
2164 if (ICL_SETACTIVE(afs_iclSetp)) {
2165 AFS_GLOCK();
2166 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2167 "after osi_NetSend()");
2168 if (!waslocked)
2169 AFS_GUNLOCK();
2171 #else
2172 if (waslocked)
2173 AFS_GLOCK();
2174 #endif
2175 #endif
2176 if (saven) { /* means we truncated the packet above. */
2177 apacket->wirevec[i - 1].iov_len = savelen;
2178 apacket->niovecs = saven;
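/*
 * Worked example of the trim/restore above (illustrative figures, added):
 * with apacket->length = 120 and data iovecs of 100 and 1400 bytes, the
 * first pass subtracts 100 (nbytes = 20), the second pass finds
 * 20 <= 1400, so it records savelen = 1400 and the old niovecs in saven,
 * clamps wirevec[2].iov_len to 20 and sets niovecs = 3.  After
 * osi_NetSend() returns, the saved values put the packet back exactly as
 * it was.
 */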
2183 static void
2184 rxi_NetSendError(struct rx_call *call, int code)
2186 int down = 0;
2187 #ifdef AFS_NT40_ENV
2188 if (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) {
2189 down = 1;
2191 if (code == -WSAEHOSTUNREACH) {
2192 down = 1;
2194 #elif defined(AFS_LINUX20_ENV)
2195 if (code == -ENETUNREACH) {
2196 down = 1;
2198 #elif defined(AFS_DARWIN_ENV)
2199 if (code == EHOSTUNREACH) {
2200 down = 1;
2202 #endif
2203 if (down) {
2204 call->lastReceiveTime = 0;
2208 /* Send the packet to appropriate destination for the specified
2209 * call. The header is first encoded and placed in the packet.
2211 void
2212 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2213 struct rx_packet *p, int istack)
2215 #if defined(KERNEL)
2216 int waslocked;
2217 #endif
2218 int code;
2219 struct sockaddr_in addr;
2220 struct rx_peer *peer = conn->peer;
2221 osi_socket socket;
2222 #ifdef RXDEBUG
2223 char deliveryType = 'S';
2224 #endif
2225 /* The address we're sending the packet to */
2226 memset(&addr, 0, sizeof(addr));
2227 addr.sin_family = AF_INET;
2228 addr.sin_port = peer->port;
2229 addr.sin_addr.s_addr = peer->host;
2230 memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2232 /* This stuff should be revamped, I think, so that most, if not
2233 * all, of the header stuff is always added here. We could
2234 * probably do away with the encode/decode routines. XXXXX */
2236 /* Stamp each packet with a unique serial number. The serial
2237 * number is maintained on a connection basis because some types
2238 * of security may be based on the serial number of the packet,
2239 * and security is handled on a per authenticated-connection
2240 * basis. */
2241 /* Pre-increment, to guarantee no zero serial number; a zero
2242 * serial number means the packet was never sent. */
2243 MUTEX_ENTER(&conn->conn_data_lock);
2244 p->header.serial = ++conn->serial;
2245 if (p->length > conn->peer->maxPacketSize) {
2246 if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2247 (p->header.flags & RX_REQUEST_ACK)) {
2248 conn->lastPingSize = p->length;
2249 conn->lastPingSizeSer = p->header.serial;
2250 } else if (p->header.seq != 0) {
2251 conn->lastPacketSize = p->length;
2252 conn->lastPacketSizeSeq = p->header.seq;
2255 MUTEX_EXIT(&conn->conn_data_lock);
2256 /* This is so we can adjust retransmit time-outs better in the face of
2257 * rapidly changing round-trip times. RTO estimation is not a la Karn.
2259 if (p->firstSerial == 0) {
2260 p->firstSerial = p->header.serial;
2262 #ifdef RXDEBUG
2263 /* If an output tracer function is defined, call it with the packet and
2264 * network address. Note this function may modify its arguments. */
2265 if (rx_almostSent) {
2266 int drop = (*rx_almostSent) (p, &addr);
2267 /* A non-zero return from the tracer asks us to drop the packet. */
2268 if (drop)
2269 deliveryType = 'D'; /* Drop the packet */
2271 #endif
2273 /* Get network byte order header */
2274 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
2275 * touch ALL the fields */
2277 /* Send the packet out on the same socket that related packets are being
2278 * received on */
2279 socket =
2280 (conn->type ==
2281 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2283 #ifdef RXDEBUG
2284 /* Possibly drop this packet, for testing purposes */
2285 if ((deliveryType == 'D')
2286 || ((rx_intentionallyDroppedPacketsPer100 > 0)
2287 && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2288 deliveryType = 'D'; /* Drop the packet */
2289 } else {
2290 deliveryType = 'S'; /* Send the packet */
2291 #endif /* RXDEBUG */
2293 /* Loop until the packet is sent. We'd prefer just to use a
2294 * blocking socket, but unfortunately the interface doesn't
2295 * allow us to have the socket block in send mode, and not
2296 * block in receive mode */
2297 #ifdef KERNEL
2298 waslocked = ISAFS_GLOCK();
2299 #ifdef RX_KERNEL_TRACE
2300 if (ICL_SETACTIVE(afs_iclSetp)) {
2301 if (!waslocked)
2302 AFS_GLOCK();
2303 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2304 "before osi_NetSend()");
2305 AFS_GUNLOCK();
2307 #else
2308 if (waslocked)
2309 AFS_GUNLOCK();
2310 #endif
2311 #endif
2312 if ((code =
2313 osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2314 p->length + RX_HEADER_SIZE, istack)) != 0) {
2315 /* send failed, so let's hurry up the resend, eh? */
2316 if (rx_stats_active)
2317 rx_atomic_inc(&rx_stats.netSendFailures);
2318 p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2320 /* Some systems are nice and tell us right away that we cannot
2321 * reach this recipient by returning an error code.
2322 * So, when this happens let's "down" the host NOW so
2323 * we don't sit around waiting for this host to timeout later.
2325 if (call) {
2326 rxi_NetSendError(call, code);
2329 #ifdef KERNEL
2330 #ifdef RX_KERNEL_TRACE
2331 if (ICL_SETACTIVE(afs_iclSetp)) {
2332 AFS_GLOCK();
2333 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2334 "after osi_NetSend()");
2335 if (!waslocked)
2336 AFS_GUNLOCK();
2338 #else
2339 if (waslocked)
2340 AFS_GLOCK();
2341 #endif
2342 #endif
2343 #ifdef RXDEBUG
2345 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2346 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2347 ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2348 p->header.seq, p->header.flags, p, p->length));
2349 #endif
2350 if (rx_stats_active) {
2351 rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2352 MUTEX_ENTER(&peer->peer_lock);
2353 peer->bytesSent += p->length;
2354 MUTEX_EXIT(&peer->peer_lock);
2358 /* Send a list of packets to appropriate destination for the specified
2359 * connection. The headers are first encoded and placed in the packets.
2361 void
2362 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2363 struct rx_packet **list, int len, int istack)
2365 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2366 int waslocked;
2367 #endif
2368 struct sockaddr_in addr;
2369 struct rx_peer *peer = conn->peer;
2370 osi_socket socket;
2371 struct rx_packet *p = NULL;
2372 struct iovec wirevec[RX_MAXIOVECS];
2373 int i, length, code;
2374 afs_uint32 serial;
2375 afs_uint32 temp;
2376 struct rx_jumboHeader *jp;
2377 #ifdef RXDEBUG
2378 char deliveryType = 'S';
2379 #endif
2380 /* The address we're sending the packet to */
2381 addr.sin_family = AF_INET;
2382 addr.sin_port = peer->port;
2383 addr.sin_addr.s_addr = peer->host;
2384 memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2386 if (len + 1 > RX_MAXIOVECS) {
2387 osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2391 * Stamp the packets in this jumbogram with consecutive serial numbers
2393 MUTEX_ENTER(&conn->conn_data_lock);
2394 serial = conn->serial;
2395 conn->serial += len;
2396 for (i = 0; i < len; i++) {
2397 p = list[i];
2398 /* a ping *or* a sequenced packet can count */
2399 if (p->length > conn->peer->maxPacketSize) {
2400 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2401 (p->header.flags & RX_REQUEST_ACK)) &&
2402 ((i == 0) || (p->length >= conn->lastPingSize))) {
2403 conn->lastPingSize = p->length;
2404 conn->lastPingSizeSer = serial + i;
2405 } else if ((p->header.seq != 0) &&
2406 ((i == 0) || (p->length >= conn->lastPacketSize))) {
2407 conn->lastPacketSize = p->length;
2408 conn->lastPacketSizeSeq = p->header.seq;
2412 MUTEX_EXIT(&conn->conn_data_lock);
2415 /* This stuff should be revamped, I think, so that most, if not
2416 * all, of the header stuff is always added here. We could
2417 * probably do away with the encode/decode routines. XXXXX */
2419 jp = NULL;
2420 length = RX_HEADER_SIZE;
2421 wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2422 wirevec[0].iov_len = RX_HEADER_SIZE;
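/*
 * Layout sketch (added, not in the original source): the iovec array
 * assembled here describes one UDP datagram of the form
 *
 *     [ rx header ][ data 0 ][ jumbo hdr ][ data 1 ][ jumbo hdr ] ... [ data len-1 ]
 *
 * Every data area except the last is exactly RX_JUMBOBUFFERSIZE bytes and
 * lives in its packet's own buffer together with the RX_JUMBOHEADERSIZE-byte
 * jumbo header that follows it (hence the niovecs > 2 panic below); the loop
 * fills each jumbo header with the flags and spare field of the packet that
 * comes after it.  Only the final packet carries no jumbo header and may be
 * shorter than RX_JUMBOBUFFERSIZE.
 */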
2423 for (i = 0; i < len; i++) {
2424 p = list[i];
2426 /* The whole 3.5 jumbogram scheme relies on packets fitting
2427 * in a single packet buffer. */
2428 if (p->niovecs > 2) {
2429 osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2432 /* Set the RX_JUMBO_PACKET flags in all but the last packets
2433 * in this chunk. */
2434 if (i < len - 1) {
2435 if (p->length != RX_JUMBOBUFFERSIZE) {
2436 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2438 p->header.flags |= RX_JUMBO_PACKET;
2439 length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2440 wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2441 } else {
2442 wirevec[i + 1].iov_len = p->length;
2443 length += p->length;
2445 wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2446 if (jp != NULL) {
2447 /* Convert jumbo packet header to network byte order */
2448 temp = (afs_uint32) (p->header.flags) << 24;
2449 temp |= (afs_uint32) (p->header.spare);
2450 *(afs_uint32 *) jp = htonl(temp);
2452 jp = (struct rx_jumboHeader *)
2453 ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2455 /* Stamp each packet with a unique serial number. The serial
2456 * number is maintained on a connection basis because some types
2457 * of security may be based on the serial number of the packet,
2458 * and security is handled on a per authenticated-connection
2459 * basis. */
2460 /* Pre-increment, to guarantee no zero serial number; a zero
2461 * serial number means the packet was never sent. */
2462 p->header.serial = ++serial;
2463 /* This is so we can adjust retransmit time-outs better in the face of
2464 * rapidly changing round-trip times. RTO estimation is not a la Karn.
2466 if (p->firstSerial == 0) {
2467 p->firstSerial = p->header.serial;
2469 #ifdef RXDEBUG
2470 /* If an output tracer function is defined, call it with the packet and
2471 * network address. Note this function may modify its arguments. */
2472 if (rx_almostSent) {
2473 int drop = (*rx_almostSent) (p, &addr);
2474 /* A non-zero return from the tracer asks us to drop the packet. */
2475 if (drop)
2476 deliveryType = 'D'; /* Drop the packet */
2478 #endif
2480 /* Get network byte order header */
2481 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
2482 * touch ALL the fields */
2485 /* Send the packet out on the same socket that related packets are being
2486 * received on */
2487 socket =
2488 (conn->type ==
2489 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2491 #ifdef RXDEBUG
2492 /* Possibly drop this packet, for testing purposes */
2493 if ((deliveryType == 'D')
2494 || ((rx_intentionallyDroppedPacketsPer100 > 0)
2495 && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2496 deliveryType = 'D'; /* Drop the packet */
2497 } else {
2498 deliveryType = 'S'; /* Send the packet */
2499 #endif /* RXDEBUG */
2501 /* Loop until the packet is sent. We'd prefer just to use a
2502 * blocking socket, but unfortunately the interface doesn't
2503 * allow us to have the socket block in send mode, and not
2504 * block in receive mode */
2505 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2506 waslocked = ISAFS_GLOCK();
2507 if (!istack && waslocked)
2508 AFS_GUNLOCK();
2509 #endif
2510 if ((code =
2511 osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2512 istack)) != 0) {
2513 /* send failed, so let's hurry up the resend, eh? */
2514 if (rx_stats_active)
2515 rx_atomic_inc(&rx_stats.netSendFailures);
2516 for (i = 0; i < len; i++) {
2517 p = list[i];
2518 p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2520 /* Some systems are nice and tell us right away that we cannot
2521 * reach this recipient by returning an error code.
2522 * So, when this happens let's "down" the host NOW so
2523 * we don't sit around waiting for this host to timeout later.
2525 if (call) {
2526 rxi_NetSendError(call, code);
2529 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2530 if (!istack && waslocked)
2531 AFS_GLOCK();
2532 #endif
2533 #ifdef RXDEBUG
2536 osi_Assert(p != NULL);
2538 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2539 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2540 ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2541 p->header.seq, p->header.flags, p, p->length));
2543 #endif
2544 if (rx_stats_active) {
2545 rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2546 MUTEX_ENTER(&peer->peer_lock);
2547 peer->bytesSent += p->length;
2548 MUTEX_EXIT(&peer->peer_lock);
2552 /* Send a raw abort packet, without any call or connection structures */
2553 void
2554 rxi_SendRawAbort(osi_socket socket, afs_uint32 host, u_short port,
2555 afs_int32 error, struct rx_packet *source, int istack)
2557 struct rx_header theader;
2558 struct sockaddr_in addr;
2559 struct iovec iov[2];
2561 memset(&theader, 0, sizeof(theader));
2562 theader.epoch = htonl(source->header.epoch);
2563 theader.callNumber = htonl(source->header.callNumber);
2564 theader.serial = htonl(1);
2565 theader.type = RX_PACKET_TYPE_ABORT;
2566 theader.serviceId = htons(source->header.serviceId);
2567 theader.securityIndex = source->header.securityIndex;
2568 theader.cid = htonl(source->header.cid);
2571 * If the abort is being sent in response to a server initiated packet,
2572 * set client_initiated in the abort to ensure it is not associated by
2573 * the receiver with a connection in the opposite direction.
2575 if ((source->header.flags & RX_CLIENT_INITIATED) != RX_CLIENT_INITIATED)
2576 theader.flags |= RX_CLIENT_INITIATED;
2578 error = htonl(error);
2580 iov[0].iov_base = &theader;
2581 iov[0].iov_len = sizeof(struct rx_header);
2582 iov[1].iov_base = &error;
2583 iov[1].iov_len = sizeof(error);
2585 addr.sin_family = AF_INET;
2586 addr.sin_addr.s_addr = host;
2587 addr.sin_port = port;
2588 memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2589 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2590 addr.sin_len = sizeof(struct sockaddr_in);
2591 #endif
2593 osi_NetSend(socket, &addr, iov, 2,
2594 sizeof(struct rx_header) + sizeof(error), istack);
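/*
 * Illustrative note (added): the datagram sent above is simply the two
 * iovecs back to back -- a full RX header marked RX_PACKET_TYPE_ABORT, with
 * epoch/cid/callNumber echoed from the offending packet, followed by the
 * 4-byte abort code in network byte order.  A hypothetical caller that has
 * just decoded a bad packet rp received from peer_host/peer_port might send
 * the abort roughly as
 *
 *     rxi_SendRawAbort(rx_socket, peer_host, peer_port, error, rp, istack);
 *
 * where peer_host, peer_port and error are placeholders, not names taken
 * from the original source.
 */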
2597 /* Send a "special" packet to the peer connection. If call is
2598 * specified, then the packet is directed to a specific call channel
2599 * associated with the connection, otherwise it is directed to the
2600 * connection only. Uses optionalPacket if it is supplied, rather than
2601 * allocating a new packet buffer. Nbytes is the length of the data
2602 * portion of the packet. If data is non-null, nbytes of data are
2603 * copied into the packet. Type is the type of the packet, as defined
2604 * in rx.h. Bug: there's a lot of duplication between this and other
2605 * routines. This needs to be cleaned up. */
2606 struct rx_packet *
2607 rxi_SendSpecial(struct rx_call *call,
2608 struct rx_connection *conn,
2609 struct rx_packet *optionalPacket, int type, char *data,
2610 int nbytes, int istack)
2612 /* Some of the following stuff should be common code for all
2613 * packet sends (it's repeated elsewhere) */
2614 struct rx_packet *p;
2615 unsigned int i = 0;
2616 int savelen = 0, saven = 0;
2617 int channel, callNumber;
2618 if (call) {
2619 channel = call->channel;
2620 callNumber = *call->callNumber;
2621 /* BUSY packets refer to the next call on this connection */
2622 if (type == RX_PACKET_TYPE_BUSY) {
2623 callNumber++;
2625 } else {
2626 channel = 0;
2627 callNumber = 0;
2629 p = optionalPacket;
2630 if (!p) {
2631 p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2632 if (!p)
2633 osi_Panic("rxi_SendSpecial failure");
2636 if (nbytes != -1)
2637 p->length = nbytes;
2638 else
2639 nbytes = p->length;
2640 p->header.serviceId = conn->serviceId;
2641 p->header.securityIndex = conn->securityIndex;
2642 p->header.cid = (conn->cid | channel);
2643 p->header.callNumber = callNumber;
2644 p->header.seq = 0;
2645 p->header.epoch = conn->epoch;
2646 p->header.type = type;
2647 p->header.flags = 0;
2648 if (conn->type == RX_CLIENT_CONNECTION)
2649 p->header.flags |= RX_CLIENT_INITIATED;
2650 if (data)
2651 rx_packetwrite(p, 0, nbytes, data);
2653 for (i = 1; i < p->niovecs; i++) {
2654 if (nbytes <= p->wirevec[i].iov_len) {
2655 savelen = p->wirevec[i].iov_len;
2656 saven = p->niovecs;
2657 p->wirevec[i].iov_len = nbytes;
2658 p->niovecs = i + 1; /* so condition fails because i == niovecs */
2659 } else
2660 nbytes -= p->wirevec[i].iov_len;
2663 if (call)
2664 rxi_Send(call, p, istack);
2665 else
2666 rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2667 if (saven) { /* means we truncated the packet above. We probably don't */
2668 /* really need to do this, but it seems safer this way, given that */
2669 /* sneaky optionalPacket... */
2670 p->wirevec[i - 1].iov_len = savelen;
2671 p->niovecs = saven;
2673 if (!optionalPacket)
2674 rxi_FreePacket(p);
2675 return optionalPacket;
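/*
 * Illustrative usage sketch (added; the surrounding variable names are
 * hypothetical): a connection-level abort carrying a 4-byte error code
 * could be produced along these lines, letting rxi_SendSpecial() allocate
 * and free the packet because no optionalPacket is supplied:
 *
 *     afs_int32 err = htonl(error);
 *     (void)rxi_SendSpecial((struct rx_call *)0, conn, NULL,
 *                           RX_PACKET_TYPE_ABORT, (char *)&err,
 *                           sizeof(err), istack);
 */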
2679 /* Encode the packet's header (from the struct header in the packet to
2680 * the net byte order representation in the wire representation of the
2681 * packet, which is what is actually sent out on the wire) */
2682 void
2683 rxi_EncodePacketHeader(struct rx_packet *p)
2685 afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base); /* MTUXXX */
2687 memset(buf, 0, RX_HEADER_SIZE);
2688 *buf++ = htonl(p->header.epoch);
2689 *buf++ = htonl(p->header.cid);
2690 *buf++ = htonl(p->header.callNumber);
2691 *buf++ = htonl(p->header.seq);
2692 *buf++ = htonl(p->header.serial);
2693 *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2694 | (((afs_uint32) p->header.flags) << 16)
2695 | (p->header.userStatus << 8) | p->header.securityIndex);
2696 /* Note: top 16 bits of this next word were reserved */
2697 *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
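/*
 * Layout note (added): the sixth word written above packs four byte-wide
 * header fields into one 32-bit value before byte swapping,
 *
 *     bits 31-24  type
 *     bits 23-16  flags
 *     bits 15-8   userStatus
 *     bits  7-0   securityIndex
 *
 * and the seventh word carries spare in its top 16 bits and serviceId in
 * its low 16 bits.  rxi_DecodePacketHeader() below reverses exactly this
 * packing.
 */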
2700 /* Decode the packet's header (from net byte order to a struct header) */
2701 void
2702 rxi_DecodePacketHeader(struct rx_packet *p)
2704 afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base); /* MTUXXX */
2705 afs_uint32 temp;
2707 p->header.epoch = ntohl(*buf);
2708 buf++;
2709 p->header.cid = ntohl(*buf);
2710 buf++;
2711 p->header.callNumber = ntohl(*buf);
2712 buf++;
2713 p->header.seq = ntohl(*buf);
2714 buf++;
2715 p->header.serial = ntohl(*buf);
2716 buf++;
2718 temp = ntohl(*buf);
2719 buf++;
2721 /* C will truncate byte fields to bytes for me */
2722 p->header.type = temp >> 24;
2723 p->header.flags = temp >> 16;
2724 p->header.userStatus = temp >> 8;
2725 p->header.securityIndex = temp >> 0;
2727 temp = ntohl(*buf);
2728 buf++;
2730 p->header.serviceId = (temp & 0xffff);
2731 p->header.spare = temp >> 16;
2732 /* Note: top 16 bits of this last word are the security checksum */
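/*
 * Illustrative round-trip sketch (added, not in the original source):
 * rxi_EncodePacketHeader() and rxi_DecodePacketHeader() are inverses for
 * every header field, so for any packet p whose wirevec[0] points at
 * RX_HEADER_SIZE bytes of storage the following should hold:
 *
 *     struct rx_header saved = p->header;
 *     rxi_EncodePacketHeader(p);
 *     rxi_DecodePacketHeader(p);
 *     osi_Assert(p->header.serial == saved.serial);
 *     osi_Assert(p->header.seq == saved.seq);
 *     osi_Assert(p->header.flags == saved.flags);
 *
 * and likewise for epoch, cid, callNumber, type, userStatus,
 * securityIndex, serviceId and spare.
 */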
2736 * LOCKS HELD: called with call->lock held.
2738 * PrepareSendPacket is the only place in the code that
2739 * can increment call->tnext. This could become an atomic
2740 * in the future. Beyond that there is nothing in this
2741 * function that requires the call to be locked. This
2742 * function can only be called by the application thread.
2744 void
2745 rxi_PrepareSendPacket(struct rx_call *call,
2746 struct rx_packet *p, int last)
2748 struct rx_connection *conn = call->conn;
2749 afs_uint32 seq = call->tnext++;
2750 unsigned int i;
2751 afs_int32 len; /* len must be a signed type; it can go negative */
2752 int code;
2754 /* No data packets on call 0. Where do these come from? */
2755 if (*call->callNumber == 0)
2756 *call->callNumber = 1;
2758 MUTEX_EXIT(&call->lock);
2759 p->flags &= ~(RX_PKTFLAG_ACKED | RX_PKTFLAG_SENT);
2761 p->header.cid = (conn->cid | call->channel);
2762 p->header.serviceId = conn->serviceId;
2763 p->header.securityIndex = conn->securityIndex;
2765 p->header.callNumber = *call->callNumber;
2766 p->header.seq = seq;
2767 p->header.epoch = conn->epoch;
2768 p->header.type = RX_PACKET_TYPE_DATA;
2769 p->header.flags = 0;
2770 p->header.spare = 0;
2771 if (conn->type == RX_CLIENT_CONNECTION)
2772 p->header.flags |= RX_CLIENT_INITIATED;
2774 if (last)
2775 p->header.flags |= RX_LAST_PACKET;
2777 clock_Zero(&p->firstSent); /* Never yet transmitted */
2778 p->header.serial = 0; /* Another way of saying never transmitted... */
2780 /* Now that we're sure this is the last data on the call, make sure
2781 * that the "length" and the sum of the iov_lens match. */
2782 len = p->length + call->conn->securityHeaderSize;
2784 for (i = 1; i < p->niovecs && len > 0; i++) {
2785 len -= p->wirevec[i].iov_len;
2787 if (len > 0) {
2788 osi_Panic("PrepareSendPacket 1\n"); /* MTUXXX */
2789 } else if (i < p->niovecs) {
2790 /* Free any extra elements in the wirevec */
2791 #if defined(RX_ENABLE_TSFPQ)
2792 rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2793 #else /* !RX_ENABLE_TSFPQ */
2794 MUTEX_ENTER(&rx_freePktQ_lock);
2795 rxi_FreeDataBufsNoLock(p, i);
2796 MUTEX_EXIT(&rx_freePktQ_lock);
2797 #endif /* !RX_ENABLE_TSFPQ */
2799 p->niovecs = i;
2801 if (len)
2802 p->wirevec[i - 1].iov_len += len;
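/*
 * Worked example of the adjustment above (illustrative figures, added):
 * with p->length = 1000, securityHeaderSize = 8 and data iovecs of 400,
 * 400 and 1400 bytes, len runs 1008 -> 608 -> 208 -> -1192 and the loop
 * stops at i = 4.  Any further data buffers (wirevec[4] and beyond) are
 * freed and niovecs drops to 4, then the statement above trims
 * wirevec[3].iov_len from 1400 down to 208, so the data iov_lens once
 * again sum to exactly length + securityHeaderSize.
 */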
2803 MUTEX_ENTER(&call->lock);
2804 code = RXS_PreparePacket(conn->securityObject, call, p);
2805 if (code) {
2806 MUTEX_EXIT(&call->lock);
2807 rxi_ConnectionError(conn, code);
2808 MUTEX_ENTER(&conn->conn_data_lock);
2809 p = rxi_SendConnectionAbort(conn, p, 0, 0);
2810 MUTEX_EXIT(&conn->conn_data_lock);
2811 MUTEX_ENTER(&call->lock);
2812 /* setting a connection error means all calls for that conn are also
2813 * error'd. if this call does not have an error by now, something is
2814 * very wrong, and we risk sending data in the clear that is supposed
2815 * to be encrypted. */
2816 osi_Assert(call->error);
2820 /* Given an interface MTU size, calculate an adjusted MTU size that
2821 * will make efficient use of the RX buffers when the peer is sending
2822 * either AFS 3.4a jumbograms or AFS 3.5 jumbograms. */
2824 rxi_AdjustIfMTU(int mtu)
2826 int adjMTU;
2827 int frags;
2829 if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2830 return mtu;
2831 adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2832 if (mtu <= adjMTU) {
2833 return mtu;
2835 mtu -= adjMTU;
2836 if (mtu <= 0) {
2837 return adjMTU;
2839 frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2840 return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
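/*
 * Worked example (added; assumes the customary sizes RX_HEADER_SIZE = 28,
 * RX_JUMBOBUFFERSIZE = 1412 and RX_JUMBOHEADERSIZE = 4 -- see rx_packet.h
 * for the authoritative values): adjMTU is then 1444, so an interface MTU
 * of 1500 leaves 1500 - 1444 = 56 bytes, too few for another 1416-byte
 * jumbo buffer, and the function returns 1444.  The MTU is rounded down so
 * every fragment fills whole rx packet buffers.
 */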
2843 /* Given an interface MTU size, and the peer's advertised max receive
2844 * size, calculate an adjusted maxMTU size that makes efficient use
2845 * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2847 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2849 int maxMTU = mtu * rxi_nSendFrags;
2850 maxMTU = MIN(maxMTU, peerMaxMTU);
2851 return rxi_AdjustIfMTU(maxMTU);
2854 /* Given a packet size, figure out how many datagram packets will fit.
2855 * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2856 * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2857 * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2859 rxi_AdjustDgramPackets(int frags, int mtu)
2861 int maxMTU;
2862 if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2863 return 1;
2865 maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2866 maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2867 /* subtract the size of the first and last packets */
2868 maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2869 if (maxMTU < 0) {
2870 return 1;
2872 return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
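/*
 * Worked example (added; assumes UDP_HDR_SIZE = 28 (IP + UDP headers),
 * RX_HEADER_SIZE = 28, RX_JUMBOBUFFERSIZE = 1412, RX_JUMBOHEADERSIZE = 4,
 * that the small-MTU early return does not trigger, and that
 * RX_MAX_PACKET_SIZE does not cap the result -- check rx_packet.h): for
 * frags = 4 and mtu = 1444, maxMTU = 4 * 1472 - 28 = 5860; subtracting
 * 28 + 2 * 1412 + 4 = 2856 for the first and last buffers leaves 3004,
 * which holds 3004 / 1416 = 2 more middle buffers, so the call returns
 * 2 + 2 = 4 datagram packets.
 */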
2875 #ifndef KERNEL
2877 * This function can be used by the Windows Cache Manager
2878 * to dump the list of all rx packets so that we can determine
2879 * where packets are being leaked.
2881 int rx_DumpPackets(FILE *outputFile, char *cookie)
2883 #ifdef RXDEBUG_PACKET
2884 struct rx_packet *p;
2885 #ifdef AFS_NT40_ENV
2886 int zilch;
2887 char output[2048];
2888 #define RXDPRINTF sprintf
2889 #define RXDPRINTOUT output
2890 #else
2891 #define RXDPRINTF fprintf
2892 #define RXDPRINTOUT outputFile
2893 #endif
2895 NETPRI;
2896 MUTEX_ENTER(&rx_freePktQ_lock);
2897 RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
2898 #ifdef AFS_NT40_ENV
2899 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2900 #endif
2902 for (p = rx_mallocedP; p; p = p->allNextp) {
2903 RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, length=%u header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
2904 cookie, p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec,
2905 p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->length,
2906 p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
2907 (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
2908 (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
2909 #ifdef AFS_NT40_ENV
2910 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2911 #endif
2914 RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
2915 #ifdef AFS_NT40_ENV
2916 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2917 #endif
2919 MUTEX_EXIT(&rx_freePktQ_lock);
2920 USERPRI;
2921 #endif /* RXDEBUG_PACKET */
2922 return 0;
2924 #endif
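/*
 * Illustrative usage (added; userspace, non-Windows builds, names
 * hypothetical): when RXDEBUG_PACKET is defined, a process hunting for
 * packet leaks can periodically snapshot every allocated rx packet, e.g.
 *
 *     rx_DumpPackets(stderr, "after-volume-release");
 *
 * where the cookie string simply tags each dump in the output so separate
 * snapshots can be told apart.
 */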