source/libs/gmp/gmp-src/mpn/x86/pentium/copyi.asm

   1 dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
   2
   3 dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P5: 1.25 cycles/limb
  35
  36
  37 C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
  38 C
  39 C Destination prefetching is done to avoid repeated write-throughs on lines
  40 C not already in L1.
  41 C
  42 C At least one of the src or dst pointer needs to be incremented rather than
  43 C using indexing, so that there's somewhere to put the loop control without
  44 C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
  45 C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
  46 C subtracts in the finishup code.
  47 C
  48 C The block of finishup code is almost as big as the main loop itself, which
  49 C is unfortunate, but it's faster that way than with say rep movsl, by about
  50 C 10 cycles for instance on P55.
  51 C
  52 C There's nothing to be gained from MMX on P55, since it can do only one
  53 C movq load (or store) per cycle, so the throughput would be the same as the
  54 C code here (and even then only if src and dst have the same alignment mod
  55 C 8).
  56
  57 defframe(PARAM_SIZE,12)
  58 defframe(PARAM_SRC, 8)
  59 defframe(PARAM_DST, 4)
  60
  61         TEXT
  62         ALIGN(8)
  63 PROLOGUE(mpn_copyi)
  64 deflit(`FRAME',0)
  65         C Copy size limbs from src to dst, lowest address first
  66         movl    PARAM_SIZE, %ecx
  67         movl    PARAM_DST, %edx
  68
  69         pushl   %ebx    FRAME_pushl()
  70         pushl   %esi    FRAME_pushl()
  71
  72         leal    (%edx,%ecx,4), %edx     C &dst[size], one past the last limb
  73         xorl    $-1, %ecx               C -size-1
  74
  75         movl    PARAM_SRC, %esi
  76         addl    $8, %ecx                C -size+7, negative iff size >= 8
  77
  78         jns     L(end)                  C fewer than 8 limbs, skip unrolled loop
  79
  80         movl    -28(%edx,%ecx,4), %eax  C fetch destination cache line, dst[0]
  81         nop                             C NOTE(review): presumably scheduling padding, confirm
  82
  83 L(top):
  84         C eax   scratch
  85         C ebx   scratch
  86         C ecx   counter, 7-(limbs remaining), negative
  87         C edx   &dst[size]
  88         C esi   src, incrementing
  89         C edi
  90         C ebp
  91         C Each pass copies 8 limbs as four load-pair/store-pair groups
  92         movl    (%edx,%ecx,4), %eax     C fetch destination cache line (8th limb of this block)
  93         addl    $8, %ecx                C 15-remaining, sets the flags tested by js below
  94
  95         movl    (%esi), %eax            C read words pairwise
  96         movl    4(%esi), %ebx
  97         movl    %eax, -60(%edx,%ecx,4)  C store words pairwise, -60 cancels the 15 limb bias
  98         movl    %ebx, -56(%edx,%ecx,4)
  99
 100         movl    8(%esi), %eax
 101         movl    12(%esi), %ebx
 102         movl    %eax, -52(%edx,%ecx,4)
 103         movl    %ebx, -48(%edx,%ecx,4)
 104
 105         movl    16(%esi), %eax
 106         movl    20(%esi), %ebx
 107         movl    %eax, -44(%edx,%ecx,4)
 108         movl    %ebx, -40(%edx,%ecx,4)
 109
 110         movl    24(%esi), %eax
 111         movl    28(%esi), %ebx
 112         movl    %eax, -36(%edx,%ecx,4)
 113         movl    %ebx, -32(%edx,%ecx,4)
 114
 115         leal    32(%esi), %esi          C src += 8 limbs, leal leaves the addl flags intact
 116         js      L(top)                  C loop while at least 8 more limbs remain
 117
 118
 119 L(end):
 120         C ecx   0 to 7, representing respectively 7 to 0 limbs remaining
 121         C esi   src, next limb to read
 122         C edx   &dst[size], next store is at -28(%edx,%ecx,4)
 123
 124         subl    $4, %ecx                C 3-remaining
 125         jns     L(no4)                  C fewer than 4 limbs left
 126
 127         movl    (%esi), %eax
 128         movl    4(%esi), %ebx
 129         movl    %eax, -12(%edx,%ecx,4)
 130         movl    %ebx, -8(%edx,%ecx,4)
 131
 132         movl    8(%esi), %eax
 133         movl    12(%esi), %ebx
 134         movl    %eax, -4(%edx,%ecx,4)
 135         movl    %ebx, (%edx,%ecx,4)
 136
 137         addl    $16, %esi               C src += 4 limbs
 138         addl    $4, %ecx                C 3-remaining again for the limbs now left
 139 L(no4):
 140
 141         subl    $2, %ecx                C 1-remaining
 142         jns     L(no2)                  C fewer than 2 limbs left
 143
 144         movl    (%esi), %eax
 145         movl    4(%esi), %ebx
 146         movl    %eax, -4(%edx,%ecx,4)
 147         movl    %ebx, (%edx,%ecx,4)
 148
 149         addl    $8, %esi                C src += 2 limbs
 150         addl    $2, %ecx                C 1-remaining again, zero iff one limb left
 151 L(no2):
 152
 153         jnz     L(done)                 C flags from the subl/addl $2 above, non-zero means none left
 154
 155         movl    (%esi), %eax            C final odd limb, goes to dst[size-1]
 156         movl    %eax, -4(%edx,%ecx,4)   C risk of cache bank clash here
 157
 158 L(done):
 159         popl    %esi                    C restore saved registers, reverse of the pushes
 160         popl    %ebx
 161
 162         ret
 163
 164 EPILOGUE()