test/sysperf/memcpy.S

   1
   2                 /*
   3                  * memcpy.S
   4                  *
   5                  * x86_64: MOVNTQ vs MOVQ, MOVNTDQ vs MOVDQ[A/U], PREFETCH[x]
   6                  *
    7                  *      NT stands for 'non-temporal', which basically means
   8                  *      'bypass L1 cache on write'.  Write bandwidth is
   9                  *      effectively reduced to the L2 cache bandwidth but
  10                  *      the L1 cache will not be wiped out by the copy.
  11                  *
  12                  *      DO NOT MIX 'nt' and standard writes!  Your performance
  13                  *      will go poof.
  14                  *
  15                  * PREFETCH[NTA,T0,T1,T2]
  16                  *
  17                  *      These instructions prefetch a cache line (typically
  18                  *      128 bytes).  'NT' means 'non-temporal', which bypasses
  19                  *      the L1 cache if the data is not already in the L1
   20                  *      cache.  HOWEVER, using PREFETCHNTA can put a slow memory
  21                  *      op in the cpu's memory request queue if a L1 or L2
  22                  *      miss occurs, and it can stall an L1-cache-hit access
  23                  *      for a small but noticeable period of time, so it is
  24                  *      a good idea not to put a memory op just after a
  25                  *      prefetchnta instruction.
  26                  *
   27                  *      You can get better L2 bandwidth using prefetchnta but
   28                  *      it will not be much more than prefetcht0 and
  29                  *      'prefetcht0' will give you better cache-miss
  30                  *      bandwidth.
  31                  *
  32                  *      The prefetch has to be done far enough ahead to do
  33                  *      some good, but it only has a significant effect when
   34                  *      it is able to move data from L2 to L1.  Prefetching
  35                  *      from main memory does not have a significant effect
   36                  *      during a copy or zeroing operation because main
  37                  *      memory bandwidth is already saturated.
  38                  *
  39                  * $DragonFly: src/test/sysperf/memcpy.S,v 1.1 2004/04/29 16:14:53 dillon Exp $
  40                  */
   41                 .text                   /* everything below is emitted into the code section */
   42                 .globl  docopy1         /* ascending 'rep movsl' copy */
   43                 .globl  docopy2         /* descending ('std') 'rep movsl' copy */
   44                 .globl  docopy3         /* 32 bytes per pass, integer registers, prefetcht0 */
   45                 .globl  docopy4         /* 8 bytes per pass, integer registers, prefetcht0 */
   46                 .globl  docopy5         /* 64 bytes per pass, MMX movq, prefetchnta */
   47                 .globl  docopy6         /* 16 bytes per pass, MMX loads + movntq stores, prefetcht0 */
   48                 .globl  docopy7         /* 128 bytes per pass, SSE2 movdqa loads + movntdq stores */
   49                 .globl  fpcleanup       /* fninit: reset the x87/MMX register state */
  50
  51                 .p2align 4,0x90
  52 docopy1:
  53                 pushl   %esi
  54                 pushl   %edi
  55                 pushl   %ecx
  56                 pushl   %ebx
  57
  58                 movl    4+16(%esp),%esi
  59                 movl    8+16(%esp),%edi
  60                 movl    12+16(%esp),%ecx
  61                 shrl    $2,%ecx
  62                 cld
  63                 rep
  64                 movsl
  65                 popl    %ebx
  66                 popl    %ecx
  67                 popl    %edi
  68                 popl    %esi
  69                 ret
  70
  71                 .p2align 4,0x90
  72 docopy2:
  73                 pushl   %esi
  74                 pushl   %edi
  75                 pushl   %ecx
  76                 pushl   %ebx
  77
  78                 movl    4+16(%esp),%esi
  79                 movl    8+16(%esp),%edi
  80                 movl    12+16(%esp),%ecx
  81                 addl    %ecx,%esi
  82                 addl    %ecx,%edi
  83                 shrl    $2,%ecx
  84                 std
  85                 rep
  86                 movsl
  87                 popl    %ebx
  88                 popl    %ecx
  89                 popl    %edi
  90                 popl    %esi
  91                 ret
  92
   93                 .p2align 4,0x90
                      /*
                       * docopy3 - unrolled integer-register copy, 32 bytes per pass.
                       *
                       * Three stack arguments are read after four 4-byte pushes
                       * (hence the extra 16 in every offset): source address,
                       * destination address and byte count.
                       *
                       * The loop body runs once before the count is ever examined
                       * and leaves only when the count reaches exactly zero, so the
                       * count must be a non-zero multiple of 32; any other value
                       * makes the loop run past the intended end.
                       *
                       * %eax and %edx are overwritten.  %esi, %edi, %ecx and %ebx
                       * are saved on entry and restored before returning.
                       */
   94 docopy3:
   95                 pushl   %esi
   96                 pushl   %edi
   97                 pushl   %ecx
   98                 pushl   %ebx
   99
  100                 movl    4+16(%esp),%esi         /* source address */
  101                 movl    8+16(%esp),%edi         /* destination address */
  102                 movl    12+16(%esp),%ecx        /* bytes left to copy */
  103
  104                 .p2align 4,0x90                 /* 16-byte align the loop top, 0x90 (nop) fill */
  105 1:
  106                 movl    (%esi),%eax             /* prime the three scratch registers */
  107                 movl    4(%esi),%ebx
  108                 movl    8(%esi),%edx
  109                 movl    %eax,(%edi)             /* from here on: store a register, then */
  110                 movl    12(%esi),%eax           /* reload it with the word 12 bytes on */
  111                 movl    %ebx,4(%edi)
  112                 movl    16(%esi),%ebx
  113                 movl    %edx,8(%edi)
  114                 movl    20(%esi),%edx
  115                 movl    %eax,12(%edi)
  116                 movl    24(%esi),%eax
  117                 movl    %ebx,16(%edi)
  118                 movl    28(%esi),%ebx           /* last load of this 32-byte block */
  119                 movl    %edx,20(%edi)
  120                 prefetcht0 96(%esi)             /* %esi not advanced yet: 96 bytes past this block's start */
  121                 subl    $32,%ecx                /* account for the block being copied */
  122                 movl    %eax,24(%edi)
  123                 addl    $32,%esi
  124                 movl    %ebx,28(%edi)
  125                 addl    $32,%edi
  126
  127                 testl   %ecx,%ecx               /* the addl's replaced subl's flags, so test again */
  128                 jnz     1b                      /* leaves only when the count is exactly zero */
  129
  130                 popl    %ebx
  131                 popl    %ecx
  132                 popl    %edi
  133                 popl    %esi
  134                 ret
 135
  136                 .p2align 4,0x90
                      /*
                       * docopy4 - minimal integer-register copy, 8 bytes per pass.
                       *
                       * Three stack arguments are read after four 4-byte pushes
                       * (hence the extra 16 in every offset): source address,
                       * destination address and byte count.
                       *
                       * The loop body runs once before the count is ever examined
                       * and leaves only when the count reaches exactly zero, so the
                       * count must be a non-zero multiple of 8.
                       *
                       * %eax is overwritten.  %esi, %edi, %ecx and %ebx are saved
                       * on entry and restored before returning.
                       */
  137 docopy4:
  138                 pushl   %esi
  139                 pushl   %edi
  140                 pushl   %ecx
  141                 pushl   %ebx
  142
  143                 movl    4+16(%esp),%esi         /* source address */
  144                 movl    8+16(%esp),%edi         /* destination address */
  145                 movl    12+16(%esp),%ecx        /* bytes left to copy */
  146
  147                 .p2align 4,0x90                 /* 16-byte align the loop top, 0x90 (nop) fill */
  148 1:
  149                 movl    (%esi),%eax             /* load two 32-bit words */
  150                 movl    4(%esi),%ebx
  151                 addl    $8,%esi
  152                 prefetcht0 64(%esi)             /* %esi already advanced: 72 bytes past the words just loaded */
  153                 subl    $8,%ecx
  154                 movl    %eax,(%edi)             /* store the two words */
  155                 movl    %ebx,4(%edi)
  156                 addl    $8,%edi
  157                 testl   %ecx,%ecx               /* addl replaced subl's flags, so test again */
  158                 jnz     1b                      /* leaves only when the count is exactly zero */
  159
  160                 popl    %ebx
  161                 popl    %ecx
  162                 popl    %edi
  163                 popl    %esi
  164                 ret
 165
  166                 .p2align 4,0x90
                      /*
                       * docopy5 - MMX copy, 64 bytes per pass, ordinary movq stores.
                       *
                       * Three stack arguments are read after four 4-byte pushes
                       * (hence the extra 16 in every offset): source address,
                       * destination address and byte count.
                       *
                       * The loop body runs once before the count is ever examined
                       * and leaves only when the count reaches exactly zero, so the
                       * count must be a non-zero multiple of 64.
                       *
                       * %mm0-%mm7 are overwritten and no emms is executed here, so
                       * the MMX state is still live on return.
                       * NOTE(review): presumably the caller is expected to run
                       * fpcleanup afterwards - confirm against the harness.
                       * %esi, %edi, %ecx and %ebx are saved and restored (%ebx is
                       * not otherwise used).
                       */
  167 docopy5:
  168                 pushl   %esi
  169                 pushl   %edi
  170                 pushl   %ecx
  171                 pushl   %ebx
  172
  173                 movl    4+16(%esp),%esi         /* source address */
  174                 movl    8+16(%esp),%edi         /* destination address */
  175                 movl    12+16(%esp),%ecx        /* bytes left to copy */
  176
  177                 .p2align 4,0x90                 /* 16-byte align the loop top, 0x90 (nop) fill */
  178 1:
  179                 movq    (%esi),%mm0             /* load the whole 64-byte block first */
  180                 movq    8(%esi),%mm1
  181                 movq    16(%esi),%mm2
  182                 movq    24(%esi),%mm3
  183                 movq    32(%esi),%mm4
  184                 movq    40(%esi),%mm5
  185                 movq    48(%esi),%mm6
  186                 movq    56(%esi),%mm7
  187                 prefetchnta 128(%esi)           /* %esi not advanced yet: 128 bytes past this block's start */
  188                 subl    $64,%ecx
  189                 addl    $64,%esi
  190                 movq    %mm0,(%edi)             /* then store all eight quadwords */
  191                 movq    %mm1,8(%edi)
  192                 movq    %mm2,16(%edi)
  193                 movq    %mm3,24(%edi)
  194                 movq    %mm4,32(%edi)
  195                 movq    %mm5,40(%edi)
  196                 movq    %mm6,48(%edi)
  197                 movq    %mm7,56(%edi)
  198                 addl    $64,%edi
  199                 testl   %ecx,%ecx               /* the addl's replaced subl's flags, so test again */
  200                 jnz     1b                      /* leaves only when the count is exactly zero */
  201
  202                 popl    %ebx
  203                 popl    %ecx
  204                 popl    %edi
  205                 popl    %esi
  206                 ret
 207
  208                 .p2align 4,0x90
                      /*
                       * docopy6 - MMX copy, 16 bytes per pass, non-temporal stores.
                       *
                       * Three stack arguments are read after four 4-byte pushes
                       * (hence the extra 16 in every offset): source address,
                       * destination address and byte count.
                       *
                       * The loop body runs once before the count is ever examined
                       * and leaves only when the count reaches exactly zero, so the
                       * count must be a non-zero multiple of 16.
                       *
                       * Data is written with movntq and no sfence is issued inside
                       * this routine.  %eax, %mm0 and %mm1 are overwritten and no
                       * emms is executed, so the MMX state is still live on return.
                       * NOTE(review): presumably the caller is expected to run
                       * fpcleanup afterwards - confirm against the harness.
                       * %esi, %edi, %ecx and %ebx are saved and restored (%ebx is
                       * not otherwise used).
                       */
  209 docopy6:
  210                 pushl   %esi
  211                 pushl   %edi
  212                 pushl   %ecx
  213                 pushl   %ebx
  214
  215                 movl    4+16(%esp),%esi         /* source address */
  216                 movl    8+16(%esp),%edi         /* destination address */
  217                 movl    12+16(%esp),%ecx        /* bytes left to copy */
  218                 movl    $16,%eax                /* per-pass step, kept in a register */
  219
  220                 .p2align 4,0x90                 /* 16-byte align the loop top, 0x90 (nop) fill */
  221 1:
  222                 prefetcht0 96(%esi)             /* %esi not advanced yet: 96 bytes past this block's start */
  223                 subl    %eax,%ecx
  224                 movq    (%esi),%mm0             /* load two quadwords */
  225                 movq    8(%esi),%mm1
  226                 addl    %eax,%esi
  227                 movntq  %mm0,(%edi)             /* non-temporal stores */
  228                 movntq  %mm1,8(%edi)
  229                 addl    %eax,%edi
  230                 testl   %ecx,%ecx               /* the addl's replaced subl's flags, so test again */
  231                 jnz     1b                      /* leaves only when the count is exactly zero */
  232
  233                 popl    %ebx
  234                 popl    %ecx
  235                 popl    %edi
  236                 popl    %esi
  237                 ret
 238
  239                 .p2align 4,0x90
                      /*
                       * docopy7 - SSE2 copy, 128 bytes per pass, non-temporal stores.
                       *
                       * Three stack arguments are read after four 4-byte pushes
                       * (hence the extra 16 in every offset): source address,
                       * destination address and byte count.
                       *
                       * The loop body runs once before the count is ever examined
                       * and leaves only when the count reaches exactly zero, so the
                       * count must be a non-zero multiple of 128.
                       *
                       * movdqa and movntdq fault on a memory operand that is not
                       * 16-byte aligned, so both buffers must be 16-byte aligned.
                       * No prefetch is used in this variant and no sfence is issued
                       * after the non-temporal stores.
                       *
                       * %eax and %xmm0-%xmm7 are overwritten.  %esi, %edi, %ecx and
                       * %ebx are saved and restored (%ebx is not otherwise used).
                       */
  240 docopy7:
  241                 pushl   %esi
  242                 pushl   %edi
  243                 pushl   %ecx
  244                 pushl   %ebx
  245
  246                 movl    4+16(%esp),%esi         /* source address */
  247                 movl    8+16(%esp),%edi         /* destination address */
  248                 movl    12+16(%esp),%ecx        /* bytes left to copy */
  249                 movl    $128,%eax               /* per-pass step, kept in a register */
  250
  251                 .p2align 4,0x90                 /* 16-byte align the loop top, 0x90 (nop) fill */
  252 1:
  253                 movdqa  (%esi),%xmm0            /* load the whole 128-byte block first */
  254                 movdqa  16(%esi),%xmm1
  255                 movdqa  32(%esi),%xmm2
  256                 movdqa  48(%esi),%xmm3
  257                 movdqa  64(%esi),%xmm4
  258                 movdqa  80(%esi),%xmm5
  259                 movdqa  96(%esi),%xmm6
  260                 movdqa  112(%esi),%xmm7
  261                 subl    %eax,%ecx
  262                 addl    %eax,%esi
  263                 movntdq  %xmm0,(%edi)           /* then write it with non-temporal stores */
  264                 movntdq  %xmm1,16(%edi)
  265                 movntdq  %xmm2,32(%edi)
  266                 movntdq  %xmm3,48(%edi)
  267                 movntdq  %xmm4,64(%edi)
  268                 movntdq  %xmm5,80(%edi)
  269                 movntdq  %xmm6,96(%edi)
  270                 movntdq  %xmm7,112(%edi)
  271                 addl    %eax,%edi
  272                 testl   %ecx,%ecx               /* the addl's replaced subl's flags, so test again */
  273                 jnz     1b                      /* leaves only when the count is exactly zero */
  274
  275                 popl    %ebx
  276                 popl    %ecx
  277                 popl    %edi
  278                 popl    %esi
  279                 ret
 280
  281                 .p2align 4,0x90
                      /*
                       * fpcleanup - reset the x87 FPU state.
                       *
                       * Takes no arguments and touches no general-purpose register.
                       * NOTE(review): presumably called after the MMX variants
                       * (docopy5/docopy6), which return without executing emms -
                       * confirm against the harness.
                       */
  282 fpcleanup:
  283                 fninit                          /* re-initialise the FPU without waiting on pending exceptions; marks all x87/MMX registers empty */
  284                 ret
 285