arch/tile/lib/memset_32.c

   1 /*
   2  * Copyright 2010 Tilera Corporation. All Rights Reserved.
   3  *
   4  *   This program is free software; you can redistribute it and/or
   5  *   modify it under the terms of the GNU General Public License
   6  *   as published by the Free Software Foundation, version 2.
   7  *
   8  *   This program is distributed in the hope that it will be useful, but
   9  *   WITHOUT ANY WARRANTY; without even the implied warranty of
  10  *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  11  *   NON INFRINGEMENT.  See the GNU General Public License for
  12  *   more details.
  13  */
  14
  15 #include <arch/chip.h>
  16
  17 #include <linux/types.h>
  18 #include <linux/string.h>
  19 #include <linux/module.h>
  20
  21
   22 void *memset(void *s, int c, size_t n)
   23 {
              /* Fill the first n bytes starting at s with the value c and
               * return s.  Bytes are stored through a uint8_t pointer, so only
               * the low 8 bits of c are written.
               *
               * Strategy, as implemented below:
               *  - n < BYTE_CUTOFF: a plain byte-at-a-time loop.
               *  - otherwise: byte stores until the pointer is 4-byte aligned
               *    and the remaining length is a multiple of 4, followed by
               *    32-bit word stores of the tiled fill value.
               *  - the word-store phase is selected at compile time by
               *    CHIP_HAS_WH64(): without it, cache lines are prefetched
               *    ahead of the stores; with it, each full cache line is
               *    passed to __insn_wh64() before being stored to.
               *
               * Locals: out8 and out32 are the byte and word store cursors,
               * n32 is the remaining length in 32-bit words, and v32 is the
               * word that gets stored.
               */
   24         uint32_t *out32;
   25         int n32;
   26         uint32_t v16, v32;
   27         uint8_t *out8 = s;
   28 #if !CHIP_HAS_WH64()
   29         int ahead32;
   30 #else
   31         int to_align32;
   32 #endif
   33
   34         /* Experimentation shows that a trivial tight loop is a win up until
   35          * around a size of 20, where writing a word at a time starts to win.
   36          */
   37 #define BYTE_CUTOFF 20
   38
   39 #if BYTE_CUTOFF < 3
   40         /* Must be at least 3: the alignment loop below stores up to
   41          * 3 bytes without checking n, relying on n >= BYTE_CUTOFF.
   42          */
   43 #error "BYTE_CUTOFF is too small"
   44 #endif
   45
   46         if (n < BYTE_CUTOFF) {
   47                 /* Strangely, this turns out to be the tightest way to
   48                  * write this loop.
   49                  */
   50                 if (n != 0) {
   51                         do {
   52                                 /* Strangely, combining these into one line
   53                                  * performs worse.
   54                                  */
   55                                 *out8 = c;
   56                                 out8++;
   57                         } while (--n != 0);
   58                 }
   59
   60                 return s;
   61         }
   62
   63 #if !CHIP_HAS_WH64()
   64         /* Use a spare issue slot to start prefetching the first cache
   65          * line early. This instruction is free as the store can be buried
   66          * in otherwise idle issue slots doing ALU ops.
   67          */
   68         __insn_prefetch(out8);
   69
   70         /* We prefetch the end so that a short memset that spans two cache
   71          * lines gets some prefetching benefit. Again we believe this is free
   72          * to issue.  n >= BYTE_CUTOFF here, so out8[n - 1] is the last
   73          * byte of the buffer.
   74          */
   74         __insn_prefetch(&out8[n - 1]);
   75 #endif /* !CHIP_HAS_WH64() */
   76
   77
   78         /* Align 'out8'. We know n >= 3 so this won't write past the end. */
   79         while (((uintptr_t) out8 & 3) != 0) {
   80                 *out8++ = c;
   81                 --n;
   82         }
   83
   84         /* Align 'n': store the trailing bytes, working backwards from the
               * end of the buffer, until the remaining length is a multiple
               * of 4.
               */
   85         while (n & 3)
   86                 out8[--n] = c;
   87
   88         out32 = (uint32_t *) out8;
   89         n32 = n >> 2;
   90
   91         /* Tile input byte out to 32 bits.
               * NOTE(review): presumably intlb/intlh interleave bytes and
               * halfwords so that every byte of v32 equals the low byte of c;
               * confirm against the intrinsic definitions.
               */
   92         v16 = __insn_intlb(c, c);
   93         v32 = __insn_intlh(v16, v16);
   94
   95         /* This must be at least 8 or the following loop doesn't work. */
   96 #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
   97
   98 #if !CHIP_HAS_WH64()
   99
  100         ahead32 = CACHE_LINE_SIZE_IN_WORDS;
  101
  102         /* We already prefetched the first and last cache lines, so
  103          * we only need to do more prefetching if we are storing
  104          * to more than two cache lines.
  105          */
  106         if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
  107                 int i;
  108
  109                 /* Prefetch the next several cache lines.
  110                  * This is the setup code for the software-pipelined
  111                  * loop below.  The prefetch distance is the length rounded
  112                  * down to whole cache lines, capped at MAX_PREFETCH lines.
  112                  */
  113 #define MAX_PREFETCH 5
  114                 ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
  115                 if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
  116                         ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
  117
  118                 for (i = CACHE_LINE_SIZE_IN_WORDS;
  119                      i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
  120                         __insn_prefetch(&out32[i]);
  121         }
  122
              /* Store one cache line of words per iteration while prefetching
               * the location 'ahead32' words beyond the current store cursor.
               */
  123         if (n32 > ahead32) {
  124                 while (1) {
  125                         int j;
  126
  127                         /* Prefetch by reading one word several cache lines
  128                          * ahead.  Since loads are non-blocking this will
  129                          * cause the full cache line to be read while we are
  130                          * finishing earlier cache lines.  Using a store
  131                          * here causes microarchitectural performance
  132                          * problems where a victimizing store miss goes to
  133                          * the head of the retry FIFO and locks the pipe for
  134                          * a few cycles.  So a few subsequent stores in this
  135                          * loop go into the retry FIFO, and then later
  136                          * stores see other stores to the same cache line
  137                          * are already in the retry FIFO and themselves go
  138                          * into the retry FIFO, filling it up and grinding
  139                          * to a halt waiting for the original miss to be
  140                          * satisfied.
  141                          */
  142                         __insn_prefetch(&out32[ahead32]);
  143
  144 #if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
  145 #error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
  146 #endif
  147
  148                         n32 -= CACHE_LINE_SIZE_IN_WORDS;
  149
  150                         /* Save icache space by only partially unrolling
  151                          * this loop.
  152                          */
  153                         for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
  154                                 *out32++ = v32;
  155                                 *out32++ = v32;
  156                                 *out32++ = v32;
  157                                 *out32++ = v32;
  158                         }
  159
  160                         /* To save compiled code size, reuse this loop even
  161                          * when we run out of prefetching to do by dropping
  162                          * ahead32 down.
  163                          */
  164                         if (n32 <= ahead32) {
  165                                 /* Not even a full cache line left,
  166                                  * so stop now.
  167                                  */
  168                                 if (n32 < CACHE_LINE_SIZE_IN_WORDS)
  169                                         break;
  170
  171                                 /* Choose a small enough value that we don't
  172                                  * prefetch past the end.  There's no sense
  173                                  * in touching cache lines we don't have to.
  174                                  */
  175                                 ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
  176                         }
  177                 }
  178         }
  179
  180 #else /* CHIP_HAS_WH64() */
  181
  182         /* Determine how many words we need to emit before the 'out32'
  183          * pointer becomes aligned modulo the cache line size.
  184          */
  185         to_align32 =
  186                 (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
  187
  188         /* Only bother aligning and using wh64 if there is at least
  189          * one full cache line to process.  This check also prevents
  190          * overrunning the end of the buffer with alignment words.
  191          */
  192         if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
  193                 int lines_left;
  194
  195                 /* Align out32 mod the cache line size so we can use wh64. */
  196                 n32 -= to_align32;
  197                 for (; to_align32 != 0; to_align32--) {
  198                         *out32 = v32;
  199                         out32++;
  200                 }
  201
  202                 /* Use unsigned divide to turn this into a right shift. */
  203                 lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
  204
  205                 do {
  206                         /* Only wh64 a few lines at a time, so we don't
  207                          * exceed the maximum number of victim lines.
  208                          */
  209                         int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
  210                                   ? lines_left
  211                                   : CHIP_MAX_OUTSTANDING_VICTIMS());
  212                         uint32_t *wh = out32;
  213                         int i = x;
  214                         int j;
  215
  216                         lines_left -= x;
  217
                              /* lines_left was at least 1 when x was computed
                               * (the enclosing 'if' guarantees one full line
                               * and the outer loop stops at 0), so this
                               * do/while never starts with i == 0.
                               * NOTE(review): this also assumes
                               * CHIP_MAX_OUTSTANDING_VICTIMS() >= 1 -- confirm.
                               */
  218                         do {
  219                                 __insn_wh64(wh);
  220                                 wh += CACHE_LINE_SIZE_IN_WORDS;
  221                         } while (--i);
  222
                              /* Store to the x lines just passed to wh64,
                               * four words per iteration.
                               */
  223                         for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4);
  224                              j != 0; j--) {
  225                                 *out32++ = v32;
  226                                 *out32++ = v32;
  227                                 *out32++ = v32;
  228                                 *out32++ = v32;
  229                         }
  230                 } while (lines_left != 0);
  231
  232                 /* We processed all full lines above, so only this many
  233                  * words remain to be processed.
  234                  */
  235                 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
  236         }
  237
  238 #endif /* CHIP_HAS_WH64() */
  239
  240         /* Now handle any leftover words not stored by the code above. */
  241         if (n32 != 0) {
  242                 do {
  243                         *out32 = v32;
  244                         out32++;
  245                 } while (--n32 != 0);
  246         }
  247
  248         return s;
  249 }
  250 EXPORT_SYMBOL(memset);