source/libs/gmp/gmp-src/mpn/x86/k7/mmx/copyi.asm

   1 dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.
   2
   3 dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C    alignment dst/src, A=0mod8 N=4mod8
  35 C       A/A   A/N   N/A   N/N
  36 C K7    0.75  1.0   1.0   0.75
  37
  38
  39 C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
  40 C
  41 C Copy src,size to dst,size.
  42 C
  43 C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
  44 C 1.33 c/l.
  45 C
  46 C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
  47 C guide 22007 appendix B), so 0.5 c/l should be possible, however nothing
  48 C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
  49 C one cycle, so perhaps some scheduling is needed to ensure it's a
  50 C load+store in each cycle, not store+store.
  51 C
  52 C If both source and destination are unaligned then one limb is processed at
  53 C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
  54 C used unaligned it would be 1.5 c/l.
  55
  56 defframe(PARAM_SIZE,12)
  57 defframe(PARAM_SRC, 8)
  58 defframe(PARAM_DST, 4)
  59 dnl  offsets 4,8,12 match the prototype order dst,src,size (defframe is not defined in this file)
  60 dnl  parameter space reused: ebx is saved in the size slot once size has been loaded into ecx
  61 define(SAVE_EBX,`PARAM_SIZE')
  62
  63 dnl  minimum 5: the unrolled code may copy 1 limb for alignment and then always copies 4 in L(top)
  64 deflit(UNROLL_THRESHOLD, 5)
  65
  66         TEXT
  67         ALIGN(32)
  68 PROLOGUE(mpn_copyi)
  69 deflit(`FRAME',0)
  70         C Copy size limbs from src to dst, working upwards from the lowest address.
  71         movl    PARAM_SIZE, %ecx        C ecx = size
  72         movl    %ebx, SAVE_EBX          C size slot is free now, park ebx there
  73
  74         movl    PARAM_SRC, %eax
  75         movl    PARAM_DST, %edx
  76
  77         cmpl    $UNROLL_THRESHOLD, %ecx
  78         jae     L(unroll)               C unsigned compare, size >= threshold
  79         C below the threshold: plain loop, or nothing at all when size is 0
  80         orl     %ecx, %ecx
  81         jz      L(simple_done)
  82
  83 L(simple):
  84         C eax   src, incrementing
  85         C ebx   scratch
  86         C ecx   counter
  87         C edx   dst, incrementing
  88         C
  89         C this loop is 2 cycles/limb
  90
  91         movl    (%eax), %ebx
  92         movl    %ebx, (%edx)
  93         decl    %ecx                    C sets ZF for the jnz below
  94         leal    4(%eax), %eax           C leal steps the pointers without changing flags
  95         leal    4(%edx), %edx
  96         jnz     L(simple)
  97
  98 L(simple_done):
  99         movl    SAVE_EBX, %ebx
 100         ret                             C no emms needed, this path used no MMX registers
 101
 102
 103 L(unroll):
 104         movl    %eax, %ebx              C ebx = src start, for the alignment test
 105         leal    -12(%eax,%ecx,4), %eax  C src end - 12
 106         subl    $3, %ecx                C size-3
 107
 108         andl    %edx, %ebx              C ebx = src & dst, edx still being dst start
 109         leal    (%edx,%ecx,4), %edx     C dst end - 12
 110         negl    %ecx                    C -(size-3), so (%eax,%ecx,4) is the src start
 111
 112         testl   $4, %ebx   C testl to pad code closer to 16 bytes for L(top)
 113         jz      L(aligned)              C bit 2 clear in src or dst, skip the single limb
 114
 115         C both src and dst unaligned, process one limb to align them
 116         movl    (%eax,%ecx,4), %ebx
 117         movl    %ebx, (%edx,%ecx,4)
 118         incl    %ecx
 119 L(aligned):
 120
 121
 122         ALIGN(16)
 123 L(top):
 124         C eax   src end - 12
 125         C ebx   not used in this loop
 126         C ecx   counter, negative, limbs
 127         C edx   dst end - 12
 128         C four limbs per iteration, through mm0 and mm1
 129         movq    (%eax,%ecx,4), %mm0
 130         movq    8(%eax,%ecx,4), %mm1
 131         addl    $4, %ecx                C carries once ecx passes from negative to 0..3
 132         movq    %mm0, -16(%edx,%ecx,4)  C -16 compensates for the addl just done
 133         movq    %mm1, -16+8(%edx,%ecx,4)
 134         ja      L(top)          C jump no carry and not zero
 135
 136
 137         C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
 138
 139         testb   $2, %cl
 140         jnz     L(finish_not_two)       C ecx 2 or 3, fewer than two limbs left
 141
 142         movq    (%eax,%ecx,4), %mm0     C ecx 0 or 1, copy two limbs
 143         movq    %mm0, (%edx,%ecx,4)
 144 L(finish_not_two):
 145
 146         testb   $1, %cl
 147         jnz     L(done)                 C ecx 1 or 3, no odd limb left
 148
 149         movl    8(%eax), %ebx           C the final limb, at src end - 4
 150         movl    %ebx, 8(%edx)
 151
 152 L(done):
 153         movl    SAVE_EBX, %ebx
 154         emms                            C mm0 and mm1 were used, so clear the MMX state
 155         ret
 156
 157 EPILOGUE()