dnl  x86 generic mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C            cycles/crossproduct  cycles/triangleproduct
C P5
C P6
C K6
C K7
C P4

C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);

C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the size is
C small.
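
C Roughly, the cross products src[i]*src[j] (i<j) are formed first, then
C doubled, and then the squares src[i]^2 added along the diagonal.  As an
C illustrative sketch only, in terms of the public mpn calls rather than
C the exact generic code, and ignoring the size==1 and size==2 special
C cases handled below:
C
C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	for (i = 1; i < size-1; i++)
C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
C	/* then add src[i]^2 into dst[2*i],dst[2*i+1] for i=0..size-1,
C	   dst[0] simply receiving the low limb of src[0]^2 */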

C The mul1 loop is not unrolled the way mul_1.asm is; it doesn't seem worth
C the code size to do so here.

C Enhancements:
C
C The addmul loop here is also not unrolled the way aorsmul_1.asm and
C mul_basecase.asm are.  Perhaps it should be.  It'd add to the complexity,
C but if unrolling is worth doing in those other places then it should be
C worthwhile here too.

C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
C might be worth considering.  That'd add quite a bit to the code size, but
C only as much as is used would be dragged into L1 cache.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

        TEXT
        ALIGN(8)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %edx

        movl    PARAM_SRC, %eax

        cmpl    $2, %edx
        movl    PARAM_DST, %ecx

        je      L(two_limbs)
        ja      L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
C eax   src
C ebx
C ecx   dst
C edx

        movl    (%eax), %eax
        mull    %eax
        movl    %eax, (%ecx)
        movl    %edx, 4(%ecx)
        ret


C -----------------------------------------------------------------------------
        ALIGN(8)
L(two_limbs):
C eax   src
C ebx
C ecx   dst
C edx
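
C The 2x2 square is src[0]^2 + 2*src[0]*src[1]*B + src[1]^2*B^2 (B=2^32):
C the three products are formed below and the cross product src[0]*src[1]
C is added in twice.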

        pushl   %ebx
        pushl   %ebp

        movl    %eax, %ebx
        movl    (%eax), %eax

        mull    %eax            C src[0]^2

        pushl   %esi
        pushl   %edi

        movl    %edx, %esi      C dst[1]
        movl    %eax, (%ecx)    C dst[0]

        movl    4(%ebx), %eax
        mull    %eax            C src[1]^2

        movl    %eax, %edi      C dst[2]
        movl    %edx, %ebp      C dst[3]

        movl    (%ebx), %eax
        mull    4(%ebx)         C src[0]*src[1]

        addl    %eax, %esi
        adcl    %edx, %edi
        adcl    $0, %ebp

        addl    %esi, %eax
        adcl    %edi, %edx
        movl    %eax, 4(%ecx)
        adcl    $0, %ebp

        movl    %edx, 8(%ecx)
        movl    %ebp, 12(%ecx)

        popl    %edi
        popl    %esi

        popl    %ebp
        popl    %ebx

        ret


C -----------------------------------------------------------------------------
        ALIGN(8)
L(three_or_more):
deflit(`FRAME',0)
C eax   src
C ebx
C ecx   dst
C edx   size

        pushl   %ebx    FRAME_pushl()
        pushl   %edi    FRAME_pushl()

        pushl   %esi    FRAME_pushl()
        pushl   %ebp    FRAME_pushl()

        leal    (%ecx,%edx,4), %edi     C &dst[size], end of this mul1
        leal    (%eax,%edx,4), %esi     C &src[size]

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
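
C In C terms this pass is roughly the following, an illustrative sketch
C only, assuming 32-bit limbs so a 64-bit type can hold a full product:
C
C	uint64_t cy = 0;
C	for (i = 1; i < size; i++)
C	  {
C	    uint64_t p = (uint64_t) src[0] * src[i] + cy;
C	    dst[i] = (uint32_t) p;	/* low limb */
C	    cy = p >> 32;		/* carry limb */
C	  }
C	dst[size] = (uint32_t) cy;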

        movl    (%eax), %ebp    C src[0], multiplier
        movl    %edx, %ecx

        negl    %ecx            C -size
        xorl    %ebx, %ebx      C clear carry limb

        incl    %ecx            C -(size-1)

L(mul1):
C eax   scratch
C ebx   carry
C ecx   counter, limbs, negative
C edx   scratch
C esi   &src[size]
C edi   &dst[size]
C ebp   multiplier

        movl    (%esi,%ecx,4), %eax
        mull    %ebp
        addl    %eax, %ebx
        adcl    $0, %edx
        movl    %ebx, (%edi,%ecx,4)
        movl    %edx, %ebx
        incl    %ecx
        jnz     L(mul1)

        movl    %ebx, (%edi)


C Add in the products src[n]*src[n+1..size-1] at dst[2*n+1...], for
C n=1..size-2.

C The last product, src[size-2]*src[size-1], is the end corner of the
C product triangle and is handled separately at the end to save looping
C overhead.  If size is 3 then it's the only one that needs to be done.

C In the outer loop %esi is a constant, and %edi just advances by 1
C limb each time.  The size of the operation decreases by 1 limb
C each time.
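
C An illustrative C sketch of the whole triangle of addmuls, including the
C corner (assuming 32-bit limbs, not the exact generic code):
C
C	for (n = 1; n <= size-2; n++)
C	  {
C	    uint64_t cy = 0;
C	    for (j = n+1; j < size; j++)
C	      {
C	        uint64_t p = (uint64_t) src[n] * src[j] + cy + dst[n+j];
C	        dst[n+j] = (uint32_t) p;
C	        cy = p >> 32;
C	      }
C	    dst[n+size] = (uint32_t) cy;
C	  }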

C eax
C ebx   carry (needing carry flag added)
C ecx
C edx
C esi   &src[size]
C edi   &dst[size]
C ebp

        movl    PARAM_SIZE, %ecx
        subl    $3, %ecx
        jz      L(corner)

        negl    %ecx

dnl  re-use parameter space
define(VAR_OUTER,`PARAM_DST')

L(outer):
C eax
C ebx
C ecx   outer loop counter, -(size-3) to -1
C edx
C esi   &src[size]
C edi   dst, pointing at stored carry limb of previous loop
C ebp

        movl    %ecx, VAR_OUTER
        addl    $4, %edi                C advance dst end

        movl    -8(%esi,%ecx,4), %ebp   C next multiplier
        subl    $1, %ecx

        xorl    %ebx, %ebx              C initial carry limb

L(inner):
C eax   scratch
C ebx   carry (needing carry flag added)
C ecx   counter, -n-1 to -1
C edx   scratch
C esi   &src[size]
C edi   dst end of this addmul
C ebp   multiplier

        movl    (%esi,%ecx,4), %eax
        mull    %ebp
        addl    %ebx, %eax
        adcl    $0, %edx
        addl    %eax, (%edi,%ecx,4)
        adcl    $0, %edx
        movl    %edx, %ebx
        addl    $1, %ecx
        jl      L(inner)

        movl    %ebx, (%edi)
        movl    VAR_OUTER, %ecx
        incl    %ecx
        jnz     L(outer)


L(corner):
C esi   &src[size]
C edi   &dst[2*size-3]

        movl    -4(%esi), %eax
        mull    -8(%esi)                C src[size-1]*src[size-2]
        addl    %eax, 0(%edi)
        adcl    $0, %edx
        movl    %edx, 4(%edi)           C dst high limb


C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
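
C Roughly, in C terms (an illustrative sketch, 32-bit limbs assumed):
C
C	uint32_t bit = 0;
C	for (i = 1; i <= 2*size-2; i++)
C	  {
C	    uint32_t next = dst[i] >> 31;
C	    dst[i] = (dst[i] << 1) | bit;
C	    bit = next;
C	  }
C	dst[2*size-1] = bit;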

        movl    PARAM_SIZE, %eax
        negl    %eax
        addl    $1, %eax                C -(size-1) and clear carry

L(lshift):
C eax   counter, negative
C ebx   next limb
C ecx
C edx
C esi
C edi   &dst[2*size-3]
C ebp

        rcll    8(%edi,%eax,8)
        rcll    12(%edi,%eax,8)
        incl    %eax
        jnz     L(lshift)

        adcl    %eax, %eax              C high bit out
        movl    %eax, 8(%edi)           C dst most significant limb


C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set yet, and it simply gets the low
C limb of src[0]^2.
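
C An illustrative C sketch of this final pass (32-bit limbs assumed, carry
C handling written with 64-bit intermediates):
C
C	uint64_t cy = 0;
C	dst[0] = 0;	/* not yet written, see above */
C	for (i = 0; i < size; i++)
C	  {
C	    uint64_t p = (uint64_t) src[i] * src[i];
C	    uint64_t s = (uint64_t) dst[2*i] + (uint32_t) p + cy;
C	    dst[2*i] = (uint32_t) s;
C	    s = (s >> 32) + dst[2*i+1] + (p >> 32);
C	    dst[2*i+1] = (uint32_t) s;
C	    cy = s >> 32;
C	  }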

        movl    PARAM_SRC, %esi
        movl    (%esi), %eax            C src[0]
        mull    %eax                    C src[0]^2

        movl    PARAM_SIZE, %ecx
        leal    (%esi,%ecx,4), %esi     C src end

        negl    %ecx                    C -size
        movl    %edx, %ebx              C initial carry

        movl    %eax, 12(%edi,%ecx,8)   C dst[0]
        incl    %ecx                    C -(size-1)

L(diag):
C eax   scratch (low product)
C ebx   carry limb
C ecx   counter, -(size-1) to -1
C edx   scratch (high product)
C esi   &src[size]
C edi   &dst[2*size-3]
C ebp   scratch (fetched dst limbs)

        movl    (%esi,%ecx,4), %eax
        mull    %eax

        addl    %ebx, 8(%edi,%ecx,8)
        movl    %edx, %ebx

        adcl    %eax, 12(%edi,%ecx,8)
        adcl    $0, %ebx

        incl    %ecx
        jnz     L(diag)

        addl    %ebx, 8(%edi)           C dst most significant limb

        popl    %ebp
        popl    %esi

        popl    %edi
        popl    %ebx

        ret

EPILOGUE()