source/libs/gmp/gmp-src/mpn/x86/p6/copyd.asm

   1 dnl  Intel P6 mpn_copyd -- copy limb vector backwards.
   2
   3 dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P6: 1.75 cycles/limb, or 0.75 if no overlap
  35
  36
  37 C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
  38 C
  39 C An explicit loop is used because a decrementing rep movsl is a bit slow at
  40 C 2.4 c/l.  That rep movsl also has about a 40 cycle startup time, and the
  41 C code here stands a chance of being faster if the branches predict well.
  42 C
  43 C The slightly strange loop form seems necessary for the claimed speed.
  44 C Maybe load/store ordering affects it.
  45 C
  46 C The source and destination are checked to see if they're actually
  47 C overlapping, since it might be possible to use an incrementing rep movsl
  48 C at 0.75 c/l.  (It doesn't suffer the bad startup time of the decrementing
  49 C version.)
  50 C
  51 C Enhancements:
  52 C
  53 C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
  54 C one store each cycle.  Unrolling the loop below would approach 1.0, but
  55 C it'd be good to know why something like store/load/subl + store/load/jnz
  56 C doesn't already run at 1.0 c/l.  It looks like it should decode in 2
  57 C cycles, but doesn't run that way.
  58
  59 defframe(PARAM_SIZE,12)
  60 defframe(PARAM_SRC, 8)
  61 defframe(PARAM_DST, 4)
  62
  63 dnl  re-using parameter space: esi goes in the size slot, edi in the src slot
  64 define(SAVE_ESI,`PARAM_SIZE')
  65 define(SAVE_EDI,`PARAM_SRC')
  66
  67         TEXT
  68         ALIGN(16)
  69
  70 PROLOGUE(mpn_copyd)
  71 deflit(`FRAME',0)
  72         C Copy size limbs high-to-low, or with rep movsl when that is safe.
  73         movl    PARAM_SIZE, %ecx                C ecx = size
  74
  75         movl    %esi, SAVE_ESI                  C size slot is free now, reuse it
  76         movl    PARAM_SRC, %esi                 C esi = src
  77
  78         movl    %edi, SAVE_EDI                  C src slot is free now, reuse it
  79         movl    PARAM_DST, %edi                 C edi = dst
  80
  81         subl    $1, %ecx                        C ecx = size-1
  82         jb      L(zero)                         C size was 0
  83
  84         movl    (%esi,%ecx,4), %eax             C src[size-1]
  85         jz      L(one)                          C size was 1 (flags from subl)
  86
  87         movl    -4(%esi,%ecx,4), %edx           C src[size-2]
  88         subl    $2, %ecx                        C ecx = size-3
  89         jbe     L(done_loop)                    C 2 or 3 limbs only
  90
  91
  92         C The usual overlap is
  93         C
  94         C     high                   low
  95         C     +------------------+
  96         C     |               dst|
  97         C     +------------------+
  98         C           +------------------+
  99         C           |               src|
 100         C           +------------------+
 101         C
 102         C We can use an incrementing copy in the following circumstances.
 103         C
 104         C     src+4*size<=dst, since then the regions are disjoint
 105         C
 106         C     src==dst, clearly (though this shouldn't occur normally)
 107         C
 108         C     src>dst, since in that case it's a requirement of the
 109         C              parameters that src>=dst+size*4, and hence the
 110         C              regions are disjoint
 111
 112
 113         leal    12(%esi,%ecx,4), %edx   C src+4*size (ecx is size-3)
 114         cmpl    %edi, %esi
 115         jae     L(use_movsl)            C src >= dst
 116
 117         cmpl    %edi, %edx              C src+4*size vs dst, flags used by jbe
 118         movl    4(%esi,%ecx,4), %edx    C src[size-2] again
 119         jbe     L(use_movsl)            C src+4*size <= dst
 120
 121
 122 L(top):
 123         C eax   prev high limb
 124         C ebx
 125         C ecx   counter, size-3 down to 0 or -1, inclusive, by 2s
 126         C edx   prev low limb
 127         C esi   src
 128         C edi   dst
 129         C ebp
 130
 131         movl    %eax, 8(%edi,%ecx,4)    C dst[ecx+2]
 132         movl    (%esi,%ecx,4), %eax     C src[ecx]
 133
 134         movl    %edx, 4(%edi,%ecx,4)    C dst[ecx+1]
 135         movl    -4(%esi,%ecx,4), %edx   C src[ecx-1]
 136
 137         subl    $2, %ecx
 138         jnbe    L(top)                  C exit once ecx reaches 0 or -1
 139
 140
 141 L(done_loop):
 142         movl    %eax, 8(%edi,%ecx,4)    C ecx is 0 or -1 here
 143         movl    %edx, 4(%edi,%ecx,4)
 144
 145         C copy low limb (needed if size was odd, but will already have been
 146         C done in the loop if size was even)
 147         movl    (%esi), %eax
 148 L(one):
 149         movl    %eax, (%edi)
 150         movl    SAVE_EDI, %edi
 151         movl    SAVE_ESI, %esi
 152
 153         ret
 154
 155
 156 L(use_movsl):
 157         C eax
 158         C ebx
 159         C ecx   size-3
 160         C edx
 161         C esi   src
 162         C edi   dst
 163         C ebp
 164
 165         addl    $3, %ecx                C ecx = size
 166
 167         cld             C better safe than sorry, see mpn/x86/README
 168
 169         rep
 170         movsl
 171
 172 L(zero):
 173         movl    SAVE_ESI, %esi
 174         movl    SAVE_EDI, %edi
 175
 176         ret
 177
 178 EPILOGUE()