source/libs/gmp/gmp-src/mpn/x86/pentium/mmx/mul_1.asm

   1 dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
   2
   3 dnl  Copyright 2000-2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C    cycles/limb
  35 C P5:   12.0   for 32-bit multiplier
  36 C        7.0   for 16-bit multiplier
  37
  38
  39 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  40 C                      mp_limb_t multiplier);
  41 C
  42 C When the multiplier is 16 bits some special case MMX code is used.  Small
  43 C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
  44 C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
  45 C size==8 end up being quite close.  If src isn't aligned to an 8 byte
  46 C boundary then one limb is processed separately with roughly a 5 cycle
  47 C penalty, so in that case it's say size==8 and size==9 which are close.
  48 C
  49 C Alternatives:
  50 C
  51 C MMX is not believed to be of any use for 32-bit multipliers, since for
  52 C instance the current method would just have to be more or less duplicated
  53 C for the high and low halves of the multiplier, and would probably
  54 C therefore run at about 14 cycles, which is slower than the plain integer
  55 C at 12.
  56 C
  57 C Adding the high and low MMX products using integer code seems best.  An
  58 C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
  59 C any joy.  Perhaps something could be done keeping the values signed and
  60 C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
  61 C perhaps not.
  62 C
  63 C Future:
  64 C
  65 C An mpn_mul_1c entrypoint would need a double carry out of the low result
  66 C limb in the 16-bit code, unless it could be assumed the carry fits in 16
  67 C bits, possibly as carry<multiplier, this being true of a big calculation
  68 C done piece by piece.  But let's worry about that if/when mul_1c is
  69 C actually used.
  70
  71 defframe(PARAM_MULTIPLIER,16)
  72 defframe(PARAM_SIZE,      12)
  73 defframe(PARAM_SRC,       8)
  74 defframe(PARAM_DST,       4)
  75
  76         TEXT
  77
  78         ALIGN(8)
  79 PROLOGUE(mpn_mul_1)
  80 deflit(`FRAME',0)
  81
  82         movl    PARAM_SIZE, %ecx
  83         movl    PARAM_SRC, %edx
  84
  85         cmpl    $1, %ecx
  86         jne     L(two_or_more)
  87
  88         C one limb only
  89
  90         movl    PARAM_MULTIPLIER, %eax
  91         movl    PARAM_DST, %ecx
  92
  93         mull    (%edx)
  94
  95         movl    %eax, (%ecx)
  96         movl    %edx, %eax
  97
  98         ret
  99
 100
 101 L(two_or_more):
 102         C eax   size
 103         C ebx
 104         C ecx   carry
 105         C edx
 106         C esi   src
 107         C edi
 108         C ebp
 109
 110         pushl   %esi            FRAME_pushl()
 111         pushl   %edi            FRAME_pushl()
 112
 113         movl    %edx, %esi              C src
 114         movl    PARAM_DST, %edi
 115
 116         movl    PARAM_MULTIPLIER, %eax
 117         pushl   %ebx            FRAME_pushl()
 118
 119         leal    (%esi,%ecx,4), %esi     C src end
 120         leal    (%edi,%ecx,4), %edi     C dst end
 121
 122         negl    %ecx                    C -size
 123
 124         pushl   %ebp            FRAME_pushl()
 125         cmpl    $65536, %eax
 126
 127         jb      L(small)
 128
 129
 130 L(big):
 131         xorl    %ebx, %ebx              C carry limb
 132         sarl    %ecx                    C -size/2
 133
 134         jnc     L(top)                  C with carry flag clear
 135
 136
 137         C size was odd, process one limb separately
 138
 139         mull    4(%esi,%ecx,8)          C m * src[0]
 140
 141         movl    %eax, 4(%edi,%ecx,8)
 142         incl    %ecx
 143
 144         orl     %edx, %ebx              C carry limb, and clear carry flag
 145
 146
 147 L(top):
 148         C eax
 149         C ebx   carry
 150         C ecx   counter, negative
 151         C edx
 152         C esi   src end
 153         C edi   dst end
 154         C ebp   (scratch carry)
 155
 156         adcl    $0, %ebx
 157         movl    (%esi,%ecx,8), %eax
 158
 159         mull    PARAM_MULTIPLIER
 160
 161         movl    %edx, %ebp
 162         addl    %eax, %ebx
 163
 164         adcl    $0, %ebp
 165         movl    4(%esi,%ecx,8), %eax
 166
 167         mull    PARAM_MULTIPLIER
 168
 169         movl    %ebx, (%edi,%ecx,8)
 170         addl    %ebp, %eax
 171
 172         movl    %eax, 4(%edi,%ecx,8)
 173         incl    %ecx
 174
 175         movl    %edx, %ebx
 176         jnz     L(top)
 177
 178
 179         adcl    $0, %ebx
 180         popl    %ebp
 181
 182         movl    %ebx, %eax
 183         popl    %ebx
 184
 185         popl    %edi
 186         popl    %esi
 187
 188         ret
 189
 190
 191 L(small):
 192         C Special case for 16-bit multiplier.
 193         C
 194         C eax   multiplier
 195         C ebx
 196         C ecx   -size
 197         C edx   src
 198         C esi   src end
 199         C edi   dst end
 200         C ebp   multiplier
 201
 202         C size<3 not supported here.  At size==3 we're already a couple of
 203         C cycles faster, so there's no threshold as such, just use the MMX
 204         C as soon as possible.
 205
 206         cmpl    $-3, %ecx
 207         ja      L(big)
 208
 209         movd    %eax, %mm7              C m
 210         pxor    %mm6, %mm6              C initial carry word
 211
 212         punpcklwd %mm7, %mm7            C m replicated 2 times
 213         addl    $2, %ecx                C -size+2
 214
 215         punpckldq %mm7, %mm7            C m replicated 4 times
 216         andl    $4, %edx                C test alignment, clear carry flag
 217
 218         movq    %mm7, %mm0              C m
 219         jz      L(small_entry)
 220
 221
 222         C Source is unaligned, process one limb separately.
 223         C
 224         C Plain integer code is used here, since it's smaller and is about
 225         C the same 13 cycles as an mmx block would be.
 226         C
 227         C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
 228         C the use of separate incl and orl.
 229
 230         mull    -8(%esi,%ecx,4)         C m * src[0]
 231
 232         movl    %eax, -8(%edi,%ecx,4)   C dst[0]
 233         incl    %ecx                    C one limb processed
 234
 235         movd    %edx, %mm6              C initial carry
 236
 237         orl     %eax, %eax              C clear carry flag
 238         jmp     L(small_entry)
 239
 240
 241 C The scheduling here is quite tricky, since so many instructions have
 242 C pairing restrictions.  In particular the js won't pair with a movd, and
 243 C can't be paired with an adc since it wants flags from the inc, so
 244 C instructions are rotated to the top of the loop to find somewhere useful
 245 C for it.
 246 C
 247 C Trouble has been taken to avoid overlapping successive loop iterations,
 248 C since that would greatly increase the size of the startup and finishup
 249 C code.  Actually there's probably not much advantage to be had from
 250 C overlapping anyway, since the difficulties are mostly with pairing, not
 251 C with latencies as such.
 252 C
 253 C In the comments x represents the src data and m the multiplier (16
 254 C bits, but replicated 4 times).
 255 C
 256 C The m signs calculated in %mm3 are a loop invariant and could be held in
 257 C say %mm5, but that would save only one instruction and hence be no faster.
 258
 259 L(small_top):
 260         C eax   l.low, then l.high
 261         C ebx   (h.low)
 262         C ecx   counter, -size+2 to 0 or 1
 263         C edx   (h.high)
 264         C esi   &src[size]
 265         C edi   &dst[size]
 266         C ebp
 267         C
 268         C %mm0  (high products)
 269         C %mm1  (low products)
 270         C %mm2  (adjust for m using x signs)
 271         C %mm3  (adjust for x using m signs)
 272         C %mm4
 273         C %mm5
 274         C %mm6  h.low, then carry
 275         C %mm7  m replicated 4 times
 276
 277         movd    %mm6, %ebx              C h.low
 278         psrlq   $32, %mm1               C l.high
 279
 280         movd    %mm0, %edx              C h.high
 281         movq    %mm0, %mm6              C new c
 282
 283         adcl    %eax, %ebx
 284         incl    %ecx
 285
 286         movd    %mm1, %eax              C l.high
 287         movq    %mm7, %mm0
 288
 289         adcl    %eax, %edx
 290         movl    %ebx, -16(%edi,%ecx,4)
 291
 292         movl    %edx, -12(%edi,%ecx,4)
 293         psrlq   $32, %mm6               C c
 294
 295 L(small_entry):
 296         pmulhw  -8(%esi,%ecx,4), %mm0   C h = (x*m).high
 297         movq    %mm7, %mm1
 298
 299         pmullw  -8(%esi,%ecx,4), %mm1   C l = (x*m).low
 300         movq    %mm7, %mm3
 301
 302         movq    -8(%esi,%ecx,4), %mm2   C x
 303         psraw   $15, %mm3               C m signs
 304
 305         pand    -8(%esi,%ecx,4), %mm3   C x selected by m signs
 306         psraw   $15, %mm2               C x signs
 307
 308         paddw   %mm3, %mm0              C add x to h if m neg
 309         pand    %mm7, %mm2              C m selected by x signs
 310
 311         paddw   %mm2, %mm0              C add m to h if x neg
 312         incl    %ecx
 313
 314         movd    %mm1, %eax              C l.low
 315         punpcklwd %mm0, %mm6            C c + h.low << 16
 316
 317         psrlq   $16, %mm0               C h.high
 318         js      L(small_top)
 319
 320
 321
 322
 323         movd    %mm6, %ebx              C h.low
 324         psrlq   $32, %mm1               C l.high
 325
 326         adcl    %eax, %ebx
 327         popl    %ebp            FRAME_popl()
 328
 329         movd    %mm0, %edx              C h.high
 330         psrlq   $32, %mm0               C l.high
 331
 332         movd    %mm1, %eax              C l.high
 333
 334         adcl    %eax, %edx
 335         movl    %ebx, -12(%edi,%ecx,4)
 336
 337         movd    %mm0, %eax              C c
 338
 339         adcl    $0, %eax
 340         movl    %edx, -8(%edi,%ecx,4)
 341
 342         orl     %ecx, %ecx
 343         jnz     L(small_done)           C final %ecx==1 means even, ==0 odd
 344
 345
 346         C Size odd, one extra limb to process.
 347         C Plain integer code is used here, since it's smaller and is about
 348         C the same speed as another mmx block would be.
 349
 350         movl    %eax, %ecx
 351         movl    PARAM_MULTIPLIER, %eax
 352
 353         mull    -4(%esi)
 354
 355         addl    %ecx, %eax
 356
 357         adcl    $0, %edx
 358         movl    %eax, -4(%edi)
 359
 360         movl    %edx, %eax
 361 L(small_done):
 362         popl    %ebx
 363
 364         popl    %edi
 365         popl    %esi
 366
 367         emms
 368
 369         ret
 370
 371 EPILOGUE()