/*
 * include/asm-x86/xor_64.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */
typedef struct {
	unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;

/* Doesn't use gcc to save the XMM registers, because there is no easy way to
   tell it to do a clts before the register saving. */
#define XMMS_SAVE				\
do {						\
	preempt_disable();			\
	asm volatile(				\
		"movq %%cr0,%0		;\n\t"	\
		"clts			;\n\t"	\
		"movups %%xmm0,(%1)	;\n\t"	\
		"movups %%xmm1,0x10(%1)	;\n\t"	\
		"movups %%xmm2,0x20(%1)	;\n\t"	\
		"movups %%xmm3,0x30(%1)	;\n\t"	\
		: "=&r" (cr0)			\
		: "r" (xmm_save)		\
		: "memory");			\
} while (0)
#define XMMS_RESTORE				\
do {						\
	asm volatile(				\
		"sfence			;\n\t"	\
		"movups (%1),%%xmm0	;\n\t"	\
		"movups 0x10(%1),%%xmm1	;\n\t"	\
		"movups 0x20(%1),%%xmm2	;\n\t"	\
		"movups 0x30(%1),%%xmm3	;\n\t"	\
		"movq %0,%%cr0		;\n\t"	\
		:				\
		: "r" (cr0), "r" (xmm_save)	\
		: "memory");			\
	preempt_enable();			\
} while (0)
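
/*
 * Note added for clarity (not in the original source): kernel code must not
 * silently clobber user FPU/SSE state, so every xor_sse_N() below brackets
 * its SSE loop with XMMS_SAVE/XMMS_RESTORE.  XMMS_SAVE disables preemption,
 * saves %cr0, clears CR0.TS with clts so the movups instructions cannot
 * fault, and spills %xmm0-%xmm3 into a local xmm_save[] buffer;
 * XMMS_RESTORE reloads the registers and the original %cr0 value, then
 * re-enables preemption.
 */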
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x)		" prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x, y)	" xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
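
/*
 * Illustrative note (added, not in the original source): the macros above
 * build the asm fragments for the unrolled loops below.  For example,
 * LD(0, 0) expands to the string
 *
 *	" movaps 16*(0)(%[p1]), %%xmm0 ;\n"
 *
 * i.e. load the first 16-byte chunk of p1 into %xmm0, while XO1(0, 0) XORs
 * the matching chunk of p2 into that register.  Each BLOCK(i) therefore
 * handles 4 x 16 = 64 bytes, and each loop iteration runs BLOCK(0) through
 * BLOCK(12), i.e. 256 bytes per buffer, which is why 'lines' below is
 * computed as bytes >> 8.
 */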
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 8;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		LD(i, 0)	\
		LD(i + 1, 1)	\
		PF1(i)		\
		PF1(i + 2)	\
		LD(i + 2, 2)	\
		LD(i + 3, 3)	\
		PF0(i + 4)	\
		PF0(i + 6)	\
		XO1(i, 0)	\
		XO1(i + 1, 1)	\
		XO1(i + 2, 2)	\
		XO1(i + 3, 3)	\
		ST(i, 0)	\
		ST(i + 1, 1)	\
		ST(i + 2, 2)	\
		ST(i + 3, 3)

		PF0(0)
		PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}
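
/*
 * Note added for clarity (not in the original source): xor_sse_3/4/5 below
 * follow the same pattern as xor_sse_2, interleaving additional xorps and
 * prefetchnta streams for the extra source buffers; p1 always doubles as
 * the destination that the result is stored back into.
 */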
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)		\
		PF1(i + 2)	\
		LD(i, 0)	\
		LD(i + 1, 1)	\
		LD(i + 2, 2)	\
		LD(i + 3, 3)	\
		PF2(i)		\
		PF2(i + 2)	\
		PF0(i + 4)	\
		PF0(i + 6)	\
		XO1(i, 0)	\
		XO1(i + 1, 1)	\
		XO1(i + 2, 2)	\
		XO1(i + 3, 3)	\
		XO2(i, 0)	\
		XO2(i + 1, 1)	\
		XO2(i + 2, 2)	\
		XO2(i + 3, 3)	\
		ST(i, 0)	\
		ST(i + 1, 1)	\
		ST(i + 2, 2)	\
		ST(i + 3, 3)

		PF0(0)
		PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)		\
		PF1(i + 2)	\
		LD(i, 0)	\
		LD(i + 1, 1)	\
		LD(i + 2, 2)	\
		LD(i + 3, 3)	\
		PF2(i)		\
		PF2(i + 2)	\
		XO1(i, 0)	\
		XO1(i + 1, 1)	\
		XO1(i + 2, 2)	\
		XO1(i + 3, 3)	\
		PF3(i)		\
		PF3(i + 2)	\
		PF0(i + 4)	\
		PF0(i + 6)	\
		XO2(i, 0)	\
		XO2(i + 1, 1)	\
		XO2(i + 2, 2)	\
		XO2(i + 3, 3)	\
		XO3(i, 0)	\
		XO3(i + 1, 1)	\
		XO3(i + 2, 2)	\
		XO3(i + 3, 3)	\
		ST(i, 0)	\
		ST(i + 1, 1)	\
		ST(i + 2, 2)	\
		ST(i + 3, 3)

		PF0(0)
		PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" addq %[inc], %[p4] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)		\
		PF1(i + 2)	\
		LD(i, 0)	\
		LD(i + 1, 1)	\
		LD(i + 2, 2)	\
		LD(i + 3, 3)	\
		PF2(i)		\
		PF2(i + 2)	\
		XO1(i, 0)	\
		XO1(i + 1, 1)	\
		XO1(i + 2, 2)	\
		XO1(i + 3, 3)	\
		PF3(i)		\
		PF3(i + 2)	\
		XO2(i, 0)	\
		XO2(i + 1, 1)	\
		XO2(i + 2, 2)	\
		XO2(i + 3, 3)	\
		PF4(i)		\
		PF4(i + 2)	\
		PF0(i + 4)	\
		PF0(i + 6)	\
		XO3(i, 0)	\
		XO3(i + 1, 1)	\
		XO3(i + 2, 2)	\
		XO3(i + 3, 3)	\
		XO4(i, 0)	\
		XO4(i + 1, 1)	\
		XO4(i + 2, 2)	\
		XO4(i + 3, 3)	\
		ST(i, 0)	\
		ST(i + 1, 1)	\
		ST(i + 2, 2)	\
		ST(i + 3, 3)

		PF0(0)
		PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" addq %[inc], %[p4] ;\n"
	" addq %[inc], %[p5] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}
static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	xor_speed(&xor_block_sse);		\
} while (0)
/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
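
/*
 * Usage sketch (illustrative, not part of this header): the generic xor
 * calibration code expands XOR_TRY_TEMPLATES to benchmark the templates
 * with xor_speed(), and XOR_SELECT_TEMPLATE then forces xor_block_sse
 * regardless of the measured winner.  A caller would invoke the hooks with
 * the destination buffer first, e.g.
 *
 *	xor_block_sse.do_2(PAGE_SIZE, dest, src);
 *
 * which XORs 'src' into 'dest', 256 bytes per loop iteration, so 'bytes'
 * must be a multiple of 256.
 */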