source/libs/gmp/gmp-src/mpn/x86/pentium4/sse2/submul_1.asm

   1 dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
   2 dnl  subtract the result from a second limb vector.
   3
   4 dnl  Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34
  35 C                           cycles/limb
  36 C P6 model 0-8,10-12            -
  37 C P6 model 9   (Banias)         6.8
  38 C P6 model 13  (Dothan)         6.9
  39 C P4 model 0-1 (Willamette)     ?
  40 C P4 model 2   (Northwood)      5.87
  41 C P4 model 3-4 (Prescott)       6.5
  42
  43 C This code represents a step forwards compared to the code available before
  44 C GMP 5.1, but it is not carefully tuned for either P6 or P4.  In fact, it is
  45 C not good for P6.  For P4 it saved a bit over 1 c/l for both Northwood and
  46 C Prescott compared to the old code.
  47 C
  48 C The arrangements made here to get a two instruction dependent chain are
  49 C slightly subtle.  In the loop the carry (or borrow rather) is a negative so
  50 C that a paddq can be used to give a low limb ready to store, and a high limb
  51 C ready to become the new carry after a psrlq.
  52 C
  53 C If the carry was a simple twos complement negative then the psrlq shift would
  54 C need to bring in 0 bits or 1 bits according to whether the high was zero or
  55 C non-zero, since a non-zero value would represent a negative needing sign
  56 C extension.  That wouldn't be particularly easy to arrange and certainly would
  57 C add an instruction to the dependent chain, so instead an offset is applied so
  58 C that the high limb will be 0xFFFFFFFF+c.  With c in the range -0xFFFFFFFF to
  59 C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
  60 C always positive and can always have 0 bits shifted in, which is what psrlq
  61 C does.
  62 C
  63 C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
  64 C done off the dependent chain.  The total adjustment then is to add
  65 C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
  66 C to remove the offset from the current carry, for a net add of
  67 C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb when
  68 C fetched.
  69 C
  70 C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
  71 C negative, which is how it's undone for the return value, but that doesn't
  72 C seem as clear.
  73
  74 defframe(PARAM_CARRY,     20)
     C The five defframe lines give each stack argument a name and a frame
     C offset: 4 = dst, 8 = src, 12 = size, 16 = multiplier, 20 = carry.
     C NOTE(review): the offsets are presumably bytes from %esp at entry, with
     C the return address at offset 0 -- confirm against the defframe
     C definition in the included m4 files.
     C PARAM_CARRY is read only by the mpn_submul_1c entry point.
  75 defframe(PARAM_MULTIPLIER,16)
  76 defframe(PARAM_SIZE,      12)
  77 defframe(PARAM_SRC,       8)
  78 defframe(PARAM_DST,       4)
  79
     C NOTE(review): TEXT presumably selects the code section; its expansion
     C comes from the included configuration.  ALIGN(16) then requests
     C 16-byte alignment for what follows, i.e. the first entry point.
  80         TEXT
  81         ALIGN(16)
  82
  83 PROLOGUE(mpn_submul_1c)
     C mpn_submul_1c -- variant entry point that starts from a caller-supplied
     C borrow limb instead of zero.
     C
     C It only loads PARAM_CARRY into mm1 (movd zero-extends the 32-bit value
     C to 64 bits) and then jumps to L(start_1c) inside mpn_submul_1, which
     C performs all of the work and the return.  Nothing else is set up here,
     C so the two entry points differ solely in the initial value of mm1.
  84 deflit(`FRAME',0)
  85         movd    PARAM_CARRY, %mm1
  86         jmp     L(start_1c)
  87 EPILOGUE()
  88
  89 PROLOGUE(mpn_submul_1)
     C mpn_submul_1 -- for each of PARAM_SIZE 32-bit limbs, subtract
     C src[i] * multiplier from dst[i] in place, carrying a borrow from limb
     C to limb, and return the final borrow limb in eax.  mpn_submul_1c
     C shares this body from L(start_1c) onwards.
     C
     C Stack arguments: PARAM_DST, PARAM_SRC, PARAM_SIZE, PARAM_MULTIPLIER
     C (and PARAM_CARRY for the 1c entry only).
     C
     C The first limb of src and dst is loaded before the size is tested, so
     C PARAM_SIZE must be at least 1; a size of 0 is not handled.
     C
     C Register roles:
     C   eax      src pointer, stepped by 8 bytes per loop pass; return value
     C   edx      dst pointer, stepped by 8 bytes per loop pass
     C   ecx      number of limbs not yet fetched
     C   mm7      multiplier
     C   mm6      constant 0xFFFFFFFE00000001, added to every dst limb
     C   mm0      running borrow in offset form, initially 0xFFFFFFFF - borrow
     C   mm1,mm3  src limbs, overwritten in place by their 64-bit products
     C   mm2,mm4  dst limbs, paired with mm1 and mm3 respectively
     C
     C Modifies eax, ecx, edx, mm0-mm4, mm6, mm7 and the flags.  No other
     C general purpose register is touched.  emms is executed before ret.
  90 deflit(`FRAME',0)
  91         pxor    %mm1, %mm1              C initial borrow
  92
             C Shared entry.  mm1 holds the incoming borrow: zero when entered
             C from above, PARAM_CARRY when reached from mpn_submul_1c.
  93 L(start_1c):
             C The integer argument loads are interleaved with building the
             C two constants.  pcmpeqd of a register with itself sets all 64
             C bits; the shifts then keep only the low or the high half.
  94         mov     PARAM_SRC, %eax
  95         pcmpeqd %mm0, %mm0
  96
  97         movd    PARAM_MULTIPLIER, %mm7
  98         pcmpeqd %mm6, %mm6
  99
 100         mov     PARAM_DST, %edx
 101         psrlq   $32, %mm0               C 0x00000000FFFFFFFF
 102
 103         mov     PARAM_SIZE, %ecx
 104         psllq   $32, %mm6               C 0xFFFFFFFF00000000
 105
 106         psubq   %mm0, %mm6              C 0xFFFFFFFE00000001
 107
 108         psubq   %mm1, %mm0              C 0xFFFFFFFF - borrow
 109
 110
             C Prime the pipeline with limb 0: mm3 = src[0], mm4 = dst[0].
 111         movd    (%eax), %mm3            C up
 112         movd    (%edx), %mm4            C rp
 113
             C The MMX instructions do not modify EFLAGS, so the jnz below
             C still tests the result of this add.  Zero means size was 1.
 114         add     $-1, %ecx
 115         paddq   %mm6, %mm4              C add 0xFFFFFFFE00000001
 116         pmuludq %mm7, %mm3
 117         jnz     L(gt1)
             C size == 1: complete limb 0 and leave through L(rt), which
             C performs the final shift of mm0.
 118         psubq   %mm3, %mm4              C prod
 119         paddq   %mm4, %mm0              C borrow
 120         movd    %mm0, (%edx)            C result
 121         jmp     L(rt)
 122
             C size >= 2: fetch limb 1 as well.  If that was the last limb
             C (size == 2) skip the loop and go to the even wind-down.
 123 L(gt1): movd    4(%eax), %mm1           C up
 124         movd    4(%edx), %mm2           C rp
 125
 126         add     $-1, %ecx
 127         jz      L(eev)
 128
             C Main loop, two limbs per pass.  On entry limb i sits in mm3/mm4
             C with its product formed and the offset constant already added,
             C limb i+1 sits unprocessed in mm1/mm2, and (edx) addresses
             C dst[i].  Each half prepares one limb (paddq, pmuludq),
             C completes the previous one (psubq, paddq into mm0, store of the
             C low 32 bits, shift to get the next borrow) and fetches the limb
             C two places ahead.  The fetches are safe because this point is
             C only reached with ecx non-zero.
 129         ALIGN(16)
 130 L(top): paddq   %mm6, %mm2              C add 0xFFFFFFFE00000001
 131         pmuludq %mm7, %mm1
 132         psubq   %mm3, %mm4              C prod
 133         movd    8(%eax), %mm3           C up
 134         paddq   %mm4, %mm0              C borrow
 135         movd    8(%edx), %mm4           C rp
 136         movd    %mm0, (%edx)            C result
 137         psrlq   $32, %mm0
 138
             C Nothing left to fetch after limb i+2: the size is odd, finish
             C at L(eod).
 139         add     $-1, %ecx
 140         jz      L(eod)
 141
             C Second half: the same steps with the register pairs swapped,
             C storing to 4(edx) and fetching limb i+3.
 142         paddq   %mm6, %mm4              C add 0xFFFFFFFE00000001
 143         pmuludq %mm7, %mm3
 144         psubq   %mm1, %mm2              C prod
 145         movd    12(%eax), %mm1          C up
 146         paddq   %mm2, %mm0              C borrow
 147         movd    12(%edx), %mm2          C rp
 148         movd    %mm0, 4(%edx)           C result
 149         psrlq   $32, %mm0
 150
             C lea advances both pointers without touching the flags; the add
             C that follows sets them for the loop branch.
 151         lea     8(%eax), %eax
 152         lea     8(%edx), %edx
 153         add     $-1, %ecx
 154         jnz     L(top)
 155
 156
             C Wind-down for an even size, also reached directly when size is
             C 2.  Limb i is prepared in mm3/mm4 and limb i+1 is unprocessed
             C in mm1/mm2; (edx) addresses dst[i].  Nothing more is loaded.
 157 L(eev): paddq   %mm6, %mm2              C add 0xFFFFFFFE00000001
 158         pmuludq %mm7, %mm1
 159         psubq   %mm3, %mm4              C prod
 160         paddq   %mm4, %mm0              C borrow
 161         movd    %mm0, (%edx)            C result
 162         psrlq   $32, %mm0
 163         psubq   %mm1, %mm2              C prod
 164         paddq   %mm2, %mm0              C borrow
 165         movd    %mm0, 4(%edx)           C result
             C Common return.  The high half of mm0 holds the final borrow in
             C offset form (0xFFFFFFFF + c with c <= 0).  Shift it down, move
             C it to eax and complement it, which yields -c, the borrow as a
             C non-negative limb.  emms resets the MMX state before returning.
 166 L(rt):  psrlq   $32, %mm0
 167         movd    %mm0, %eax
 168         not     %eax
 169         emms
 170         ret
 171
             C Wind-down for an odd size of 3 or more.  Limb i+1 is prepared
             C in mm1/mm2 and limb i+2 is unprocessed in mm3/mm4.  The
             C pointers were not advanced on this path, hence the stores at
             C offsets 4 and 8.
 172 L(eod): paddq   %mm6, %mm4              C add 0xFFFFFFFE00000001
 173         pmuludq %mm7, %mm3
 174         psubq   %mm1, %mm2              C prod
 175         paddq   %mm2, %mm0              C borrow
 176         movd    %mm0, 4(%edx)           C result
 177         psrlq   $32, %mm0
 178         psubq   %mm3, %mm4              C prod
 179         paddq   %mm4, %mm0              C borrow
 180         movd    %mm0, 8(%edx)           C result
 181         jmp     L(rt)
 182 EPILOGUE()