source/libs/gmp/gmp-src/mpn/x86/pentium4/sse2/rsh1add_n.asm

   1 dnl  Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
   2
   3 dnl  Copyright 2001-2004 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C        cycles/limb (approx)
  35 C      dst!=src1,2  dst==src1  dst==src2
  36 C P4:      4.5         6.5        6.5
  37
  38
  39 C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
  40 C                          mp_size_t size);
  41 C
  42 C The slightly strange combination of indexing and pointer incrementing
  43 C that's used seems to work best.  Not sure why, but for instance leal
  44 C incrementing on %esi is a 1 or 2 cycle slowdown.
  45 C
  46 C The dependent chain is paddq combining the carry and next (shifted) part,
  47 C plus psrlq to move the new carry down.  That, and just 4 mmx instructions
  48 C in total, makes 4 c/l the target speed, which is almost achieved for
  49 C separate src/dst but when src==dst the write combining anomalies slow it
  50 C down.
  51
  52 defframe(PARAM_SIZE, 16)
  53 defframe(PARAM_YP,   12)
  54 defframe(PARAM_XP,   8)
  55 defframe(PARAM_WP,   4)
  56 dnl  offsets above rise in the order of the C prototype: wp, xp, yp, size
  57 dnl  re-use parameter space: the xp and yp slots later hold the saved ebx and esi
  58 define(SAVE_EBX,`PARAM_XP')
  59 define(SAVE_ESI,`PARAM_YP')
  60
  61         TEXT
  62         ALIGN(8)
  63
  64 PROLOGUE(mpn_rsh1add_n)
  65 deflit(`FRAME',0)
  66         C wp[0..size-1] = (xp[] + yp[]) >> 1, eax returns the bit shifted out
  67         movl    PARAM_XP, %edx
  68         movl    %ebx, SAVE_EBX          C xp is in edx now, so its slot can hold ebx
  69
  70         movl    PARAM_YP, %ebx
  71         movl    %esi, SAVE_ESI          C yp is in ebx now, so its slot can hold esi
  72
  73         movl    PARAM_WP, %esi
  74         C limb 0 is loaded before size is looked at, so size must be at least 1
  75         movd    (%edx), %mm0            C xp[0]
  76
  77         movd    (%ebx), %mm1            C yp[0]
  78         movl    PARAM_SIZE, %ecx
  79
  80         movl    (%edx), %eax            C xp[0]
  81
  82         addl    (%ebx), %eax            C xp[0]+yp[0], only bit 0 is used below
  83
  84         paddq   %mm1, %mm0              C xp[0]+yp[0], full 33-bit sum
  85         leal    (%esi,%ecx,4), %esi     C wp end, ie. wp + 4*size bytes
  86         negl    %ecx                    C -size
  87
  88         psrlq   $1, %mm0                C (xp[0]+yp[0])/2
  89         and     $1, %eax                C return value, rsh1 bit of xp[0]+yp[0]
  90         addl    $1, %ecx                C -(size-1), zero when size is 1
  91         jz      L(done)                 C one limb only, mm0 already holds the result
  92
  93         C each pass adds limb i+1 in at bit 31, stores limb i, keeps the rest in mm0
  94 L(top):
  95         C eax   return value
  96         C ebx   &yp[i], stepped by one limb each pass
  97         C ecx   counter, limbs, -(size-1) to -1 inclusive
  98         C edx   &xp[i], stepped by one limb each pass
  99         C esi   wp end
 100         C mm0   carry (32 bits), the shifted-down part of the previous sum
 101
 102         movd    4(%edx), %mm1   C xp[i+1]
 103         movd    4(%ebx), %mm2   C yp[i+1]
 104         leal    4(%edx), %edx           C step xp by one limb
 105         leal    4(%ebx), %ebx           C step yp by one limb
 106         paddq   %mm2, %mm1              C xp[i+1]+yp[i+1]
 107         psllq   $31, %mm1               C low bit at 31, further 32 above
 108
 109         paddq   %mm1, %mm0              C add in the 32 bits kept from the prev limb
 110         movd    %mm0, -4(%esi,%ecx,4)   C low ready to store dst[i]
 111
 112         psrlq   $32, %mm0               C high becomes new carry
 113
 114         addl    $1, %ecx
 115         jnz     L(top)                  C loop until the counter reaches zero
 116
 117
 118 L(done):
 119         movd    %mm0, -4(%esi)          C dst[size-1], includes the carry out of the add
 120         movl    SAVE_EBX, %ebx
 121
 122         movl    SAVE_ESI, %esi
 123         emms                            C empty the MMX state before returning
 124         ret
 125
 126 EPILOGUE()