include/asm-x86/xor_64.h

   1 /*
   2  * Optimized RAID-5 checksumming functions for MMX and SSE.
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2, or (at your option)
   7  * any later version.
   8  *
   9  * You should have received a copy of the GNU General Public License
  10  * (for example /usr/src/linux/COPYING); if not, write to the Free
  11  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  12  */
  13
  14
  15 /*
  16  * Cache avoiding checksumming functions utilizing KNI instructions
  17  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  18  */
  19
  20 /*
  21  * Based on
  22  * High-speed RAID5 checksumming functions utilizing SSE instructions.
  23  * Copyright (C) 1998 Ingo Molnar.
  24  */
  25
  26 /*
  27  * x86-64 changes / gcc fixes from Andi Kleen.
  28  * Copyright 2002 Andi Kleen, SuSE Labs.
  29  *
  30  * This hasn't been optimized for the hammer yet, but there are likely
  31  * no advantages to be gotten from x86-64 here anyways.
  32  */
  33
   /* Two unsigned longs forced to 16-byte alignment.  The xor_sse_* routines
      in this file declare a local xmm_save[4] of this type, which
      XMMS_SAVE/XMMS_RESTORE use as the spill area for %xmm0-%xmm3. */
   34 typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
  35
   36 /* Doesn't use gcc to save the XMM registers, because there is no easy way to
   37    tell it to do a clts before the register saving. */
      /* XMMS_SAVE: calls preempt_disable(), copies %cr0 into the local `cr0`,
         executes clts, then stores %xmm0-%xmm3 into the local `xmm_save` at
         byte offsets 0x00/0x10/0x20/0x30.  The macro names `cr0` and
         `xmm_save` directly, so the enclosing function must declare both.
         Nothing here re-enables preemption; XMMS_RESTORE does that. */
   38 #define XMMS_SAVE do {                          \
   39         preempt_disable();                      \
   40         asm volatile (                          \
   41                 "movq %%cr0,%0          ;\n\t"  \
   42                 "clts                   ;\n\t"  \
   43                 "movups %%xmm0,(%1)     ;\n\t"  \
   44                 "movups %%xmm1,0x10(%1) ;\n\t"  \
   45                 "movups %%xmm2,0x20(%1) ;\n\t"  \
   46                 "movups %%xmm3,0x30(%1) ;\n\t"  \
   47                 : "=&r" (cr0)                   \
   48                 : "r" (xmm_save)                \
   49                 : "memory");                    \
   50 } while(0)
  51
   /* XMMS_RESTORE: executes sfence, reloads %xmm0-%xmm3 from the local
      `xmm_save` (offsets 0x00/0x10/0x20/0x30), writes the local `cr0` value
      back into %cr0, then calls preempt_enable().  It reads the same `cr0`
      and `xmm_save` locals that XMMS_SAVE fills in, so the two macros must be
      used as a pair inside one function.
      NOTE(review): the sfence presumably orders the preceding stores before
      the state is restored -- confirm the intended reason. */
   52 #define XMMS_RESTORE do {                       \
   53         asm volatile (                          \
   54                 "sfence                 ;\n\t"  \
   55                 "movups (%1),%%xmm0     ;\n\t"  \
   56                 "movups 0x10(%1),%%xmm1 ;\n\t"  \
   57                 "movups 0x20(%1),%%xmm2 ;\n\t"  \
   58                 "movups 0x30(%1),%%xmm3 ;\n\t"  \
   59                 "movq   %0,%%cr0        ;\n\t"  \
   60                 :                               \
   61                 : "r" (cr0), "r" (xmm_save)     \
   62                 : "memory");                    \
   63         preempt_enable();                       \
   64 } while(0)
  65
   /* String fragments that are pasted together to build the inline-asm bodies
      of the xor_sse_* routines.  In all of them `x` is an index in 16-byte
      units and `y` is the number of the %xmm register to use.

        OFFS(x)     byte displacement 16*x
        PF_OFFS(x)  byte displacement 256+16*x (used only by the prefetches)
        LD(x,y)     movaps 16*x(%[p1]) -> %xmmY
        ST(x,y)     movaps %xmmY -> 16*x(%[p1])
        PF0..PF5    prefetchnta at PF_OFFS(x) from %[p1]..%[p6]
        XO1..XO5    xorps 16*x(%[p2]..%[p6]) into %xmmY

      The operand names %[p1]..%[p6] and must be supplied by the asm statement
      that uses the fragment.  PF5 and XO5 reference %[p6], which none of the
      routines visible in this file provide. */
   66 #define OFFS(x)         "16*("#x")"
   67 #define PF_OFFS(x)      "256+16*("#x")"
   68 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
   69 #define LD(x,y)         "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
   70 #define ST(x,y)         "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
   71 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
   72 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
   73 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
   74 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
   75 #define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
   76 #define XO1(x,y)        "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
   77 #define XO2(x,y)        "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
   78 #define XO3(x,y)        "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
   79 #define XO4(x,y)        "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
   80 #define XO5(x,y)        "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
  81
  82
   /*
    * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 (p1 ^= p2).
    *
    * bytes: length of the buffers; shifted right by 8 to get the loop count,
    *        so each pass of the asm loop handles 256 bytes.
    * p1:    destination, also the first source (loaded, xored, stored back).
    * p2:    second source (read only).
    *
    * Each BLOCK(i) loads four 16-byte chunks of p1 into %xmm0-%xmm3, xors in
    * the matching chunks of p2 and stores the result back to p1, while
    * issuing prefetchnta 256 bytes or more ahead on both pointers.  Four
    * BLOCKs make up one 256-byte pass.  XMMS_SAVE/XMMS_RESTORE bracket the
    * loop and use the `cr0` and `xmm_save` locals declared here.
    *
    * NOTE(review): the count is tested only by the decl/jnz at the bottom of
    * the loop, so a call with bytes < 256 (lines == 0) is not handled;
    * callers presumably always pass a non-zero multiple of 256 -- confirm.
    * The movaps/xorps memory operands also require 16-byte aligned buffers.
    */
   83 static void
   84 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
   85 {
   86         unsigned int lines = bytes >> 8;
   87         unsigned long cr0;
   88         xmm_store_t xmm_save[4];
   89
   90         XMMS_SAVE;
   91
   92         asm volatile (
   93 #undef BLOCK
   94 #define BLOCK(i) \
   95                 LD(i,0)                                 \
   96                         LD(i+1,1)                       \
   97                 PF1(i)                                  \
   98                                 PF1(i+2)                \
   99                                 LD(i+2,2)               \
  100                                         LD(i+3,3)       \
  101                 PF0(i+4)                                \
  102                                 PF0(i+6)                \
  103                 XO1(i,0)                                \
  104                         XO1(i+1,1)                      \
  105                                 XO1(i+2,2)              \
  106                                         XO1(i+3,3)      \
  107                 ST(i,0)                                 \
  108                         ST(i+1,1)                       \
  109                                 ST(i+2,2)               \
  110                                         ST(i+3,3)       \
  111
  112
                      /* Prefetch p1+256 and p1+288 before entering the loop. */
  113                 PF0(0)
  114                                 PF0(2)
  115
  116         " .align 32                     ;\n"
  117         " 1:                            ;\n"
  118
  119                 BLOCK(0)
  120                 BLOCK(4)
  121                 BLOCK(8)
  122                 BLOCK(12)
  123
              /* Step both pointers past the 256 bytes just processed and
                 repeat until the line count reaches zero. */
  124         "       addq %[inc], %[p1]           ;\n"
  125         "       addq %[inc], %[p2]           ;\n"
  126                 "               decl %[cnt] ; jnz 1b"
  127         : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
  128         : [inc] "r" (256UL)
  129         : "memory");
  130
  131         XMMS_RESTORE;
  132 }
 133
  /*
   * xor_sse_3 - XOR the buffers at p2 and p3 into the buffer at p1
   * (p1 ^= p2 ^ p3).
   *
   * bytes: length of the buffers; shifted right by 8 to get the loop count,
   *        so each pass of the asm loop handles 256 bytes.
   * p1:    destination, also the first source.
   * p2,p3: additional sources (read only).
   *
   * Each BLOCK(i) loads four 16-byte chunks of p1 into %xmm0-%xmm3, xors in
   * the matching chunks of p2 and then p3, and stores the result back to p1,
   * with prefetchnta issued 256 bytes or more ahead on all three pointers.
   * XMMS_SAVE/XMMS_RESTORE bracket the loop and use the `cr0` and `xmm_save`
   * locals declared here.
   *
   * NOTE(review): the count is tested only by the decl/jnz at the bottom of
   * the loop, so a call with bytes < 256 (lines == 0) is not handled;
   * callers presumably always pass a non-zero multiple of 256 -- confirm.
   * The movaps/xorps memory operands also require 16-byte aligned buffers.
   */
  134 static void
  135 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
  136           unsigned long *p3)
  137 {
  138         unsigned int lines = bytes >> 8;
  139         xmm_store_t xmm_save[4];
  140         unsigned long cr0;
  141
  142         XMMS_SAVE;
  143
  144         __asm__ __volatile__ (
  145 #undef BLOCK
  146 #define BLOCK(i) \
  147                 PF1(i)                                  \
  148                                 PF1(i+2)                \
  149                 LD(i,0)                                 \
  150                         LD(i+1,1)                       \
  151                                 LD(i+2,2)               \
  152                                         LD(i+3,3)       \
  153                 PF2(i)                                  \
  154                                 PF2(i+2)                \
  155                 PF0(i+4)                                \
  156                                 PF0(i+6)                \
  157                 XO1(i,0)                                \
  158                         XO1(i+1,1)                      \
  159                                 XO1(i+2,2)              \
  160                                         XO1(i+3,3)      \
  161                 XO2(i,0)                                \
  162                         XO2(i+1,1)                      \
  163                                 XO2(i+2,2)              \
  164                                         XO2(i+3,3)      \
  165                 ST(i,0)                                 \
  166                         ST(i+1,1)                       \
  167                                 ST(i+2,2)               \
  168                                         ST(i+3,3)       \
  169
  170
                      /* Prefetch p1+256 and p1+288 before entering the loop. */
  171                 PF0(0)
  172                                 PF0(2)
  173
  174         " .align 32                     ;\n"
  175         " 1:                            ;\n"
  176
  177                 BLOCK(0)
  178                 BLOCK(4)
  179                 BLOCK(8)
  180                 BLOCK(12)
  181
              /* Step all three pointers past the 256 bytes just processed
                 and repeat until the line count reaches zero. */
  182         "       addq %[inc], %[p1]           ;\n"
  183         "       addq %[inc], %[p2]          ;\n"
  184         "       addq %[inc], %[p3]           ;\n"
  185                 "               decl %[cnt] ; jnz 1b"
  186         : [cnt] "+r" (lines),
  187           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
  188         : [inc] "r" (256UL)
  189         : "memory");
  190         XMMS_RESTORE;
  191 }
 192
  /*
   * xor_sse_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
   * (p1 ^= p2 ^ p3 ^ p4).
   *
   * bytes:    length of the buffers; shifted right by 8 to get the loop
   *           count, so each pass of the asm loop handles 256 bytes.
   * p1:       destination, also the first source.
   * p2,p3,p4: additional sources (read only).
   *
   * Each BLOCK(i) loads four 16-byte chunks of p1 into %xmm0-%xmm3, xors in
   * the matching chunks of p2, p3 and p4 in turn, and stores the result back
   * to p1, with prefetchnta issued 256 bytes or more ahead on all four
   * pointers.  The loop counter uses the `c` register constraint here,
   * whereas the 2- and 3-buffer variants use a plain `r` constraint.
   * XMMS_SAVE/XMMS_RESTORE bracket the loop and use the `cr0` and `xmm_save`
   * locals declared here.
   *
   * NOTE(review): the count is tested only by the decl/jnz at the bottom of
   * the loop, so a call with bytes < 256 (lines == 0) is not handled;
   * callers presumably always pass a non-zero multiple of 256 -- confirm.
   * The movaps/xorps memory operands also require 16-byte aligned buffers.
   */
  193 static void
  194 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
  195           unsigned long *p3, unsigned long *p4)
  196 {
  197         unsigned int lines = bytes >> 8;
  198         xmm_store_t xmm_save[4];
  199         unsigned long cr0;
  200
  201         XMMS_SAVE;
  202
  203         __asm__ __volatile__ (
  204 #undef BLOCK
  205 #define BLOCK(i) \
  206                 PF1(i)                                  \
  207                                 PF1(i+2)                \
  208                 LD(i,0)                                 \
  209                         LD(i+1,1)                       \
  210                                 LD(i+2,2)               \
  211                                         LD(i+3,3)       \
  212                 PF2(i)                                  \
  213                                 PF2(i+2)                \
  214                 XO1(i,0)                                \
  215                         XO1(i+1,1)                      \
  216                                 XO1(i+2,2)              \
  217                                         XO1(i+3,3)      \
  218                 PF3(i)                                  \
  219                                 PF3(i+2)                \
  220                 PF0(i+4)                                \
  221                                 PF0(i+6)                \
  222                 XO2(i,0)                                \
  223                         XO2(i+1,1)                      \
  224                                 XO2(i+2,2)              \
  225                                         XO2(i+3,3)      \
  226                 XO3(i,0)                                \
  227                         XO3(i+1,1)                      \
  228                                 XO3(i+2,2)              \
  229                                         XO3(i+3,3)      \
  230                 ST(i,0)                                 \
  231                         ST(i+1,1)                       \
  232                                 ST(i+2,2)               \
  233                                         ST(i+3,3)       \
  234
  235
                      /* Prefetch p1+256 and p1+288 before entering the loop. */
  236                 PF0(0)
  237                                 PF0(2)
  238
  239         " .align 32                     ;\n"
  240         " 1:                            ;\n"
  241
  242                 BLOCK(0)
  243                 BLOCK(4)
  244                 BLOCK(8)
  245                 BLOCK(12)
  246
              /* Step all four pointers past the 256 bytes just processed
                 and repeat until the line count reaches zero. */
  247         "       addq %[inc], %[p1]           ;\n"
  248         "       addq %[inc], %[p2]           ;\n"
  249         "       addq %[inc], %[p3]           ;\n"
  250         "       addq %[inc], %[p4]           ;\n"
  251         "       decl %[cnt] ; jnz 1b"
  252         : [cnt] "+c" (lines),
  253           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
  254         : [inc] "r" (256UL)
  255         : "memory" );
  256
  257         XMMS_RESTORE;
  258 }
 259
  /*
   * xor_sse_5 - XOR the buffers at p2, p3, p4 and p5 into the buffer at p1
   * (p1 ^= p2 ^ p3 ^ p4 ^ p5).
   *
   * bytes: length of the buffers; shifted right by 8 to get the loop count,
   *        so each pass of the asm loop handles 256 bytes.
   * p1:    destination, also the first source.
   * p2-p5: additional sources (read only).
   *
   * Each BLOCK(i) loads four 16-byte chunks of p1 into %xmm0-%xmm3, xors in
   * the matching chunks of p2, p3, p4 and p5 in turn, and stores the result
   * back to p1, with prefetchnta issued 256 bytes or more ahead on all five
   * pointers.  The loop counter uses the `c` register constraint here,
   * whereas the 2- and 3-buffer variants use a plain `r` constraint.
   * XMMS_SAVE/XMMS_RESTORE bracket the loop and use the `cr0` and `xmm_save`
   * locals declared here.
   *
   * NOTE(review): the count is tested only by the decl/jnz at the bottom of
   * the loop, so a call with bytes < 256 (lines == 0) is not handled;
   * callers presumably always pass a non-zero multiple of 256 -- confirm.
   * The movaps/xorps memory operands also require 16-byte aligned buffers.
   */
  260 static void
  261 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
  262           unsigned long *p3, unsigned long *p4, unsigned long *p5)
  263 {
  264         unsigned int lines = bytes >> 8;
  265         xmm_store_t xmm_save[4];
  266         unsigned long cr0;
  267
  268         XMMS_SAVE;
  269
  270         __asm__ __volatile__ (
  271 #undef BLOCK
  272 #define BLOCK(i) \
  273                 PF1(i)                                  \
  274                                 PF1(i+2)                \
  275                 LD(i,0)                                 \
  276                         LD(i+1,1)                       \
  277                                 LD(i+2,2)               \
  278                                         LD(i+3,3)       \
  279                 PF2(i)                                  \
  280                                 PF2(i+2)                \
  281                 XO1(i,0)                                \
  282                         XO1(i+1,1)                      \
  283                                 XO1(i+2,2)              \
  284                                         XO1(i+3,3)      \
  285                 PF3(i)                                  \
  286                                 PF3(i+2)                \
  287                 XO2(i,0)                                \
  288                         XO2(i+1,1)                      \
  289                                 XO2(i+2,2)              \
  290                                         XO2(i+3,3)      \
  291                 PF4(i)                                  \
  292                                 PF4(i+2)                \
  293                 PF0(i+4)                                \
  294                                 PF0(i+6)                \
  295                 XO3(i,0)                                \
  296                         XO3(i+1,1)                      \
  297                                 XO3(i+2,2)              \
  298                                         XO3(i+3,3)      \
  299                 XO4(i,0)                                \
  300                         XO4(i+1,1)                      \
  301                                 XO4(i+2,2)              \
  302                                         XO4(i+3,3)      \
  303                 ST(i,0)                                 \
  304                         ST(i+1,1)                       \
  305                                 ST(i+2,2)               \
  306                                         ST(i+3,3)       \
  307
  308
                      /* Prefetch p1+256 and p1+288 before entering the loop. */
  309                 PF0(0)
  310                                 PF0(2)
  311
  312         " .align 32                     ;\n"
  313         " 1:                            ;\n"
  314
  315                 BLOCK(0)
  316                 BLOCK(4)
  317                 BLOCK(8)
  318                 BLOCK(12)
  319
              /* Step all five pointers past the 256 bytes just processed
                 and repeat until the line count reaches zero. */
  320         "       addq %[inc], %[p1]           ;\n"
  321         "       addq %[inc], %[p2]           ;\n"
  322         "       addq %[inc], %[p3]           ;\n"
  323         "       addq %[inc], %[p4]           ;\n"
  324         "       addq %[inc], %[p5]           ;\n"
  325         "       decl %[cnt] ; jnz 1b"
  326         : [cnt] "+c" (lines),
  327           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
  328           [p5] "+r" (p5)
  329         : [inc] "r" (256UL)
  330         : "memory");
  331
  332         XMMS_RESTORE;
  333 }
 334
  /* Template that groups the four SSE routines above under the name
     "generic_sse"; .do_N points at the N-buffer variant.  struct
     xor_block_template itself is declared outside this file. */
  335 static struct xor_block_template xor_block_sse = {
  336         .name = "generic_sse",
  337         .do_2 = xor_sse_2,
  338         .do_3 = xor_sse_3,
  339         .do_4 = xor_sse_4,
  340         .do_5 = xor_sse_5,
  341 };
 342
  /* Replace any earlier XOR_TRY_TEMPLATES definition with one that hands only
     xor_block_sse to xor_speed().  xor_speed() is defined elsewhere;
     presumably it benchmarks the template -- confirm against the caller. */
  343 #undef XOR_TRY_TEMPLATES
  344 #define XOR_TRY_TEMPLATES                               \
  345         do {                                            \
  346                 xor_speed(&xor_block_sse);      \
  347         } while (0)
 348
  349 /* We force the use of the SSE xor block because it can write around L2.
  350    We may also be able to load into the L1 only depending on how the cpu
  351    deals with a load to a line that is being prefetched.  */
      /* The FASTEST argument is ignored: the macro always evaluates to
         &xor_block_sse. */
  352 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)