/*
 * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdlib.h>

#undef memcpy

void * mymemcpy(void *dest, const void *src, size_t size)
{
	asm goto (
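	/* Copy the C arguments into the registers the hand-written routine */
	/* expects (src in %rsi, dest in %rdi, size in %rdx) and keep dest */
	/* in %rax, mirroring memcpy's convention of returning dest. */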
28 "movq %0, %%rsi\n\t"
29 "movq %1, %%rdi\n\t"
30 "movq %2, %%rdx\n\t"
31 "movq %%rdi, %%rax\n\t"
32 "cmp $32, %%rdx\n\t"
33 "jb less_vec\n\t"
34 "cmp $(32 * 2), %%rdx\n\t"
35 "ja more_2x_vec\n\t"
36 "vmovdqu (%%rsi), %%ymm0\n\t"
37 "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
38 "vmovdqu %%ymm0, (%%rdi)\n\t"
39 "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
40 "vzeroupper\n\t"
41 "jmp %l[done]\n\t"
42 "less_vec:\n\t"
43 /* Less than 1 VEC. */
44 "cmpb $32, %%dl\n\t"
45 "jae between_32_63\n\t"
46 "cmpb $16, %%dl\n\t"
47 "jae between_16_31\n\t"
48 "cmpb $8, %%dl\n\t"
49 "jae between_8_15\n\t"
50 "cmpb $4, %%dl\n\t"
51 "jae between_4_7\n\t"
52 "cmpb $1, %%dl\n\t"
53 "ja between_2_3\n\t"
54 "jb 1f\n\t"
55 "movzbl (%%rsi), %%ecx\n\t"
56 "movb %%cl, (%%rdi)\n\t"
57 "1:\n\t"
58 "jmp %l[done]\n\t"
59 "between_32_63:\n\t"
60 /* From 32 to 63. No branch when size == 32. */
61 "vmovdqu (%%rsi), %%ymm0\n\t"
62 "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
63 "vmovdqu %%ymm0, (%%rdi)\n\t"
64 "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
65 "vzeroupper\n\t"
66 "jmp %l[done]\n\t"
67 /* From 16 to 31. No branch when size == 16. */
68 "between_16_31:\n\t"
69 "vmovdqu (%%rsi), %%xmm0\n\t"
70 "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
71 "vmovdqu %%xmm0, (%%rdi)\n\t"
72 "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
73 "jmp %l[done]\n\t"
74 "between_8_15:\n\t"
75 /* From 8 to 15. No branch when size == 8. */
76 "movq -8(%%rsi,%%rdx), %%rcx\n\t"
77 "movq (%%rsi), %%rsi\n\t"
78 "movq %%rcx, -8(%%rdi,%%rdx)\n\t"
79 "movq %%rsi, (%%rdi)\n\t"
80 "jmp %l[done]\n\t"
81 "between_4_7:\n\t"
82 /* From 4 to 7. No branch when size == 4. */
83 "movl -4(%%rsi,%%rdx), %%ecx\n\t"
84 "movl (%%rsi), %%esi\n\t"
85 "movl %%ecx, -4(%%rdi,%%rdx)\n\t"
86 "movl %%esi, (%%rdi)\n\t"
87 "jmp %l[done]\n\t"
88 "between_2_3:\n\t"
89 /* From 2 to 3. No branch when size == 2. */
90 "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
91 "movzwl (%%rsi), %%esi\n\t"
92 "movw %%cx, -2(%%rdi,%%rdx)\n\t"
93 "movw %%si, (%%rdi)\n\t"
94 "jmp %l[done]\n\t"
95 "more_2x_vec:\n\t"
96 /* More than 2 * VEC and there may be overlap between destination */
97 /* and source. */
98 "cmpq $(32 * 8), %%rdx\n\t"
99 "ja more_8x_vec\n\t"
100 "cmpq $(32 * 4), %%rdx\n\t"
101 "jb last_4x_vec\n\t"
	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
	"vmovdqu (%%rsi), %%ymm0\n\t"
	"vmovdqu 32(%%rsi), %%ymm1\n\t"
	"vmovdqu (32 * 2)(%%rsi), %%ymm2\n\t"
	"vmovdqu (32 * 3)(%%rsi), %%ymm3\n\t"
	"vmovdqu -32(%%rsi,%%rdx), %%ymm4\n\t"
	"vmovdqu -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
	"vmovdqu -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
	"vmovdqu -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
	"vmovdqu %%ymm0, (%%rdi)\n\t"
	"vmovdqu %%ymm1, 32(%%rdi)\n\t"
	"vmovdqu %%ymm2, (32 * 2)(%%rdi)\n\t"
	"vmovdqu %%ymm3, (32 * 3)(%%rdi)\n\t"
	"vmovdqu %%ymm4, -32(%%rdi,%%rdx)\n\t"
	"vmovdqu %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
	"vmovdqu %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
	"vmovdqu %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
	"vzeroupper\n\t"
	"jmp %l[done]\n\t"
	"last_4x_vec:\n\t"
	/* Copy from 2 * VEC to 4 * VEC. */
	"vmovdqu (%%rsi), %%ymm0\n\t"
	"vmovdqu 32(%%rsi), %%ymm1\n\t"
	"vmovdqu -32(%%rsi,%%rdx), %%ymm2\n\t"
	"vmovdqu -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
	"vmovdqu %%ymm0, (%%rdi)\n\t"
	"vmovdqu %%ymm1, 32(%%rdi)\n\t"
	"vmovdqu %%ymm2, -32(%%rdi,%%rdx)\n\t"
	"vmovdqu %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
	"vzeroupper\n\t"
	"nop:\n\t"
	"jmp %l[done]\n\t"
	"more_8x_vec:\n\t"
	"cmpq %%rsi, %%rdi\n\t"
	"ja more_8x_vec_backward\n\t"
	/* Source == destination is less common. */
	"je nop\n\t"
	/* Load the first VEC and last 4 * VEC to support overlapping addresses. */
	"vmovdqu (%%rsi), %%ymm4\n\t"
	"vmovdqu -32(%%rsi, %%rdx), %%ymm5\n\t"
	"vmovdqu -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
	"vmovdqu -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
	"vmovdqu -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
	/* Save start and stop of the destination buffer. */
	"movq %%rdi, %%r11\n\t"
	"leaq -32(%%rdi, %%rdx), %%rcx\n\t"
	/* Align destination for aligned stores in the loop. Compute */
	/* how much destination is misaligned. */
	"movq %%rdi, %%r8\n\t"
	"andq $(32 - 1), %%r8\n\t"
	/* Get the negative of offset for alignment. */
	"subq $32, %%r8\n\t"
	/* Adjust source. */
	"subq %%r8, %%rsi\n\t"
	/* Adjust destination which should be aligned now. */
	"subq %%r8, %%rdi\n\t"
	/* Adjust length. */
	"addq %%r8, %%rdx\n\t"
	/* Check non-temporal store threshold. */
	"cmpq $(1024*1024), %%rdx\n\t"
	"ja large_forward\n\t"
163 "loop_4x_vec_forward:\n\t"
164 /* Copy 4 * VEC a time forward. */
165 "vmovdqu (%%rsi), %%ymm0\n\t"
166 "vmovdqu 32(%%rsi), %%ymm1\n\t"
167 "vmovdqu (32 * 2)(%%rsi), %%ymm2\n\t"
168 "vmovdqu (32 * 3)(%%rsi), %%ymm3\n\t"
169 "addq $(32 * 4), %%rsi\n\t"
170 "subq $(32 * 4), %%rdx\n\t"
171 "vmovdqa %%ymm0, (%%rdi)\n\t"
172 "vmovdqa %%ymm1, 32(%%rdi)\n\t"
173 "vmovdqa %%ymm2, (32 * 2)(%%rdi)\n\t"
174 "vmovdqa %%ymm3, (32 * 3)(%%rdi)\n\t"
175 "addq $(32 * 4), %%rdi\n\t"
176 "cmpq $(32 * 4), %%rdx\n\t"
177 "ja loop_4x_vec_forward\n\t"
178 /* Store the last 4 * VEC. */
179 "vmovdqu %%ymm5, (%%rcx)\n\t"
180 "vmovdqu %%ymm6, -32(%%rcx)\n\t"
181 "vmovdqu %%ymm7, -(32 * 2)(%%rcx)\n\t"
182 "vmovdqu %%ymm8, -(32 * 3)(%%rcx)\n\t"
183 /* Store the first VEC. */
184 "vmovdqu %%ymm4, (%%r11)\n\t"
185 "vzeroupper\n\t"
186 "jmp %l[done]\n\t"
187 "more_8x_vec_backward:\n\t"
188 /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
189 "vmovdqu (%%rsi), %%ymm4\n\t"
190 "vmovdqu 32(%%rsi), %%ymm5\n\t"
191 "vmovdqu (32 * 2)(%%rsi), %%ymm6\n\t"
192 "vmovdqu (32 * 3)(%%rsi), %%ymm7\n\t"
193 "vmovdqu -32(%%rsi,%%rdx), %%ymm8\n\t"
194 /* Save stop of the destination buffer. */
195 "leaq -32(%%rdi, %%rdx), %%r11\n\t"
196 /* Align destination end for aligned stores in the loop. Compute */
197 /* how much destination end is misaligned. */
198 "leaq -32(%%rsi, %%rdx), %%rcx\n\t"
199 "movq %%r11, %%r9\n\t"
200 "movq %%r11, %%r8\n\t"
201 "andq $(32 - 1), %%r8\n\t"
202 /* Adjust source. */
203 "subq %%r8, %%rcx\n\t"
204 /* Adjust the end of destination which should be aligned now. */
205 "subq %%r8, %%r9\n\t"
206 /* Adjust length. */
207 "subq %%r8, %%rdx\n\t"
208 /* Check non-temporal store threshold. */
209 "cmpq $(1024*1024), %%rdx\n\t"
210 "ja large_backward\n\t"
211 "loop_4x_vec_backward:\n\t"
212 /* Copy 4 * VEC a time backward. */
213 "vmovdqu (%%rcx), %%ymm0\n\t"
214 "vmovdqu -32(%%rcx), %%ymm1\n\t"
215 "vmovdqu -(32 * 2)(%%rcx), %%ymm2\n\t"
216 "vmovdqu -(32 * 3)(%%rcx), %%ymm3\n\t"
217 "subq $(32 * 4), %%rcx\n\t"
218 "subq $(32 * 4), %%rdx\n\t"
219 "vmovdqa %%ymm0, (%%r9)\n\t"
220 "vmovdqa %%ymm1, -32(%%r9)\n\t"
221 "vmovdqa %%ymm2, -(32 * 2)(%%r9)\n\t"
222 "vmovdqa %%ymm3, -(32 * 3)(%%r9)\n\t"
223 "subq $(32 * 4), %%r9\n\t"
224 "cmpq $(32 * 4), %%rdx\n\t"
225 "ja loop_4x_vec_backward\n\t"
226 /* Store the first 4 * VEC. */
227 "vmovdqu %%ymm4, (%%rdi)\n\t"
228 "vmovdqu %%ymm5, 32(%%rdi)\n\t"
229 "vmovdqu %%ymm6, (32 * 2)(%%rdi)\n\t"
230 "vmovdqu %%ymm7, (32 * 3)(%%rdi)\n\t"
231 /* Store the last VEC. */
232 "vmovdqu %%ymm8, (%%r11)\n\t"
233 "vzeroupper\n\t"
234 "jmp %l[done]\n\t"
236 "large_forward:\n\t"
237 /* Don't use non-temporal store if there is overlap between */
238 /* destination and source since destination may be in cache */
239 /* when source is loaded. */
240 "leaq (%%rdi, %%rdx), %%r10\n\t"
241 "cmpq %%r10, %%rsi\n\t"
242 "jb loop_4x_vec_forward\n\t"
243 "loop_large_forward:\n\t"
244 /* Copy 4 * VEC a time forward with non-temporal stores. */
245 "prefetcht0 (32*4*2)(%%rsi)\n\t"
246 "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
247 "prefetcht0 (32*4*3)(%%rsi)\n\t"
248 "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
249 "vmovdqu (%%rsi), %%ymm0\n\t"
250 "vmovdqu 32(%%rsi), %%ymm1\n\t"
251 "vmovdqu (32 * 2)(%%rsi), %%ymm2\n\t"
252 "vmovdqu (32 * 3)(%%rsi), %%ymm3\n\t"
253 "addq $(32*4), %%rsi\n\t"
254 "subq $(32*4), %%rdx\n\t"
255 "vmovntdq %%ymm0, (%%rdi)\n\t"
256 "vmovntdq %%ymm1, 32(%%rdi)\n\t"
257 "vmovntdq %%ymm2, (32 * 2)(%%rdi)\n\t"
258 "vmovntdq %%ymm3, (32 * 3)(%%rdi)\n\t"
259 "addq $(32*4), %%rdi\n\t"
260 "cmpq $(32*4), %%rdx\n\t"
261 "ja loop_large_forward\n\t"
262 "sfence\n\t"
	/* Store the last 4 * VEC. */
	"vmovdqu %%ymm5, (%%rcx)\n\t"
	"vmovdqu %%ymm6, -32(%%rcx)\n\t"
	"vmovdqu %%ymm7, -(32 * 2)(%%rcx)\n\t"
	"vmovdqu %%ymm8, -(32 * 3)(%%rcx)\n\t"
	/* Store the first VEC. */
	"vmovdqu %%ymm4, (%%r11)\n\t"
	"vzeroupper\n\t"
	"jmp %l[done]\n\t"
	"large_backward:\n\t"
	/* Don't use non-temporal store if there is overlap between */
	/* destination and source since destination may be in cache */
	/* when source is loaded. */
	"leaq (%%rcx, %%rdx), %%r10\n\t"
	"cmpq %%r10, %%r9\n\t"
	"jb loop_4x_vec_backward\n\t"
	"loop_large_backward:\n\t"
	/* Copy 4 * VEC at a time backward with non-temporal stores. */
	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
	"vmovdqu (%%rcx), %%ymm0\n\t"
	"vmovdqu -32(%%rcx), %%ymm1\n\t"
	"vmovdqu -(32 * 2)(%%rcx), %%ymm2\n\t"
	"vmovdqu -(32 * 3)(%%rcx), %%ymm3\n\t"
	"subq $(32*4), %%rcx\n\t"
	"subq $(32*4), %%rdx\n\t"
	"vmovntdq %%ymm0, (%%r9)\n\t"
	"vmovntdq %%ymm1, -32(%%r9)\n\t"
	"vmovntdq %%ymm2, -(32 * 2)(%%r9)\n\t"
	"vmovntdq %%ymm3, -(32 * 3)(%%r9)\n\t"
	"subq $(32 * 4), %%r9\n\t"
	"cmpq $(32 * 4), %%rdx\n\t"
	"ja loop_large_backward\n\t"
	"sfence\n\t"
	/* Store the first 4 * VEC. */
	"vmovdqu %%ymm4, (%%rdi)\n\t"
	"vmovdqu %%ymm5, 32(%%rdi)\n\t"
	"vmovdqu %%ymm6, (32 * 2)(%%rdi)\n\t"
	"vmovdqu %%ymm7, (32 * 3)(%%rdi)\n\t"
	/* Store the last VEC. */
	"vmovdqu %%ymm8, (%%r11)\n\t"
	"vzeroupper\n\t"
	"jmp %l[done]"
309 : "r"(src), "r"(dest), "r"(size)
310 : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
311 "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
312 : done
done:
	return dest;
}
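
/*
 * A minimal, hypothetical self-check (not part of the original test harness):
 * it assumes an AVX2-capable CPU and simply compares mymemcpy against the
 * libc memcpy for a handful of sizes. Build with something like
 *   cc -mavx2 -DSTANDALONE_TEST -o amd-test amd.c
 */
#ifdef STANDALONE_TEST
#include <stdio.h>
#include <string.h>

int main(void)
{
	static unsigned char src[8192], got[8192], want[8192];
	size_t sizes[] = { 0, 1, 3, 7, 15, 31, 32, 63, 100, 257, 4096, 8192 };
	int fail = 0;

	/* Fill the source with a simple non-repeating pattern. */
	for (size_t i = 0; i < sizeof src; i++)
		src[i] = (unsigned char)(i * 131 + 7);

	for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
		memset(got, 0xaa, sizeof got);
		memset(want, 0xaa, sizeof want);
		mymemcpy(got, src, sizes[i]);
		memcpy(want, src, sizes[i]);
		/* Compare whole buffers so out-of-bounds writes also show up. */
		if (memcmp(got, want, sizeof got)) {
			printf("size %zu: MISMATCH\n", sizes[i]);
			fail = 1;
		}
	}
	puts(fail ? "FAIL" : "all sizes ok");
	return fail;
}
#endif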