source/libs/gmp/gmp-src/mpn/x86/k6/gcd_1.asm

   1 dnl  AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
   2
   3 dnl  Copyright 2000-2002, 2004, 2014 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C K6: 9.5 cycles/bit (approx)   1x1 gcd
  35 C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
  36
  37
  38 C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
  39 C
  40 C This code is nothing very special, but offers a speedup over what gcc 2.95
  41 C can do with mpn/generic/gcd_1.c.
  42 C
  43 C Future:
  44 C
  45 C Using a lookup table to count trailing zeros seems a touch quicker, but
  46 C after a slightly longer startup.  Might be worthwhile if an mpn_gcd_2 used
  47 C it too.
  48
  49
  50 dnl  If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
  51 dnl  bigger than y, then a division x%y is done to reduce it.
  52 dnl
  53 dnl  A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
  54 dnl  there should be an advantage in the divl at about 4 or 5 bits, which is
  55 dnl  what's found.
  56
  57 deflit(DIV_THRESHOLD, 5)
  58
  59
  60 defframe(PARAM_LIMB, 12)
  61 defframe(PARAM_SIZE,  8)
  62 defframe(PARAM_SRC,   4)
  63
  64         TEXT
  65         ALIGN(16)
  66
  67 PROLOGUE(mpn_gcd_1)
  68 deflit(`FRAME',0)
  69
  70         ASSERT(ne, `cmpl $0, PARAM_LIMB')
  71         ASSERT(ae, `cmpl $1, PARAM_SIZE')
  72
  73
  74         movl    PARAM_SRC, %eax
  75         pushl   %ebx                    FRAME_pushl()
  76
  77         movl    PARAM_LIMB, %edx
  78         movl    $-1, %ecx
  79
  80         movl    (%eax), %ebx            C src low limb
  81
  82         movl    %ebx, %eax              C src low limb
  83         orl     %edx, %ebx
  84
  85 L(common_twos):
  86         shrl    %ebx
  87         incl    %ecx
  88
  89         jnc     L(common_twos)          C 1/4 chance on random data
  90         shrl    %cl, %edx               C y
  91
  92         cmpl    $1, PARAM_SIZE
  93         ja      L(size_two_or_more)
  94
  95
  96         ASSERT(nz, `orl %eax, %eax')    C should have src limb != 0
  97
  98         shrl    %cl, %eax               C x
  99
 100
 101         C Swap if necessary to make x>=y.  Measures a touch quicker as a
 102         C jump than a branch free calculation.
 103         C
 104         C eax   x
 105         C ebx
 106         C ecx   common twos
 107         C edx   y
 108
 109         movl    %eax, %ebx
 110         cmpl    %eax, %edx
 111
 112         jb      L(noswap)
 113         movl    %edx, %eax
 114
 115         movl    %ebx, %edx
 116         movl    %eax, %ebx
 117 L(noswap):
 118
 119
 120         C See if it's worth reducing x with a divl.
 121         C
 122         C eax   x
 123         C ebx   x
 124         C ecx   common twos
 125         C edx   y
 126
 127         shrl    $DIV_THRESHOLD, %ebx
 128
 129         cmpl    %ebx, %edx
 130         ja      L(nodiv)
 131
 132
 133         C Reduce x to x%y.
 134         C
 135         C eax   x
 136         C ebx
 137         C ecx   common twos
 138         C edx   y
 139
 140         movl    %edx, %ebx
 141         xorl    %edx, %edx
 142
 143         divl    %ebx
 144
 145         orl     %edx, %edx      C y
 146         nop     C code alignment
 147
 148         movl    %ebx, %eax      C x
 149         jz      L(done_shll)
 150 L(nodiv):
 151
 152
 153         C eax   x
 154         C ebx
 155         C ecx   common twos
 156         C edx   y
 157         C esi
 158         C edi
 159         C ebp
 160
 161 L(strip_y):
 162         shrl    %edx
 163         jnc     L(strip_y)
 164
 165         leal    1(%edx,%edx), %edx
 166         movl    %ecx, %ebx      C common twos
 167
 168         leal    1(%eax), %ecx
 169         jmp     L(strip_x_and)
 170
 171
 172 C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
 173 C on a 50/50 chance of 0 or 1.  The chance of the next bit also being 0 is
 174 C only 1/4.
 175 C
 176 C A second computed %cl shift was tried, but that measured a touch slower
 177 C than branching back.
 178 C
 179 C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
 180 C measured about 1 cycle/bit slower.
 181
 182         C eax   x
 183         C ebx   common twos
 184         C ecx   scratch
 185         C edx   y
 186
 187         ALIGN(4)
 188 L(swap):
 189         addl    %eax, %edx      C x-y+y = x
 190         negl    %eax            C -(x-y) = y-x
 191
 192 L(strip_x):
 193         shrl    %eax            C odd-odd = even, so always one to strip
 194         ASSERT(nz)
 195
 196 L(strip_x_leal):
 197         leal    1(%eax), %ecx
 198
 199 L(strip_x_and):
 200         andl    $1, %ecx        C (x^1)&1
 201
 202         shrl    %cl, %eax       C shift if x even
 203
 204         testb   $1, %al
 205         jz      L(strip_x)
 206
 207         ASSERT(nz,`testl $1, %eax')     C x, y odd
 208         ASSERT(nz,`testl $1, %edx')
 209
 210         subl    %edx, %eax
 211         jb      L(swap)
 212         ja      L(strip_x)
 213
 214
 215         movl    %edx, %eax
 216         movl    %ebx, %ecx
 217
 218 L(done_shll):
 219         shll    %cl, %eax
 220         popl    %ebx
 221
 222         ret
 223
 224
 225 C -----------------------------------------------------------------------------
 226 C Two or more limbs.
 227 C
 228 C x={src,size} is reduced modulo y using either a plain mod_1 style
 229 C remainder, or a modexact_1 style exact division.
 230
 231 deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
 232
 233         ALIGN(8)
 234 L(size_two_or_more):
 235         C eax
 236         C ebx
 237         C ecx   common twos
 238         C edx   y, without common twos
 239         C esi
 240         C edi
 241         C ebp
 242
 243 deflit(FRAME_TWO_OR_MORE, FRAME)
 244
 245         pushl   %edi            defframe_pushl(SAVE_EDI)
 246         movl    PARAM_SRC, %ebx
 247
 248 L(y_twos):
 249         shrl    %edx
 250         jnc     L(y_twos)
 251
 252         movl    %ecx, %edi              C common twos
 253         movl    PARAM_SIZE, %ecx
 254
 255         pushl   %esi            defframe_pushl(SAVE_ESI)
 256         leal    1(%edx,%edx), %esi      C y (odd)
 257
 258         movl    -4(%ebx,%ecx,4), %eax   C src high limb
 259
 260         cmpl    %edx, %eax              C carry if high<divisor
 261
 262         sbbl    %edx, %edx              C -1 if high<divisor
 263
 264         addl    %edx, %ecx              C skip one limb if high<divisor
 265         andl    %eax, %edx
 266
 267         cmpl    $MODEXACT_THRESHOLD, %ecx
 268         jae     L(modexact)
 269
 270
 271 L(divide_top):
 272         C eax   scratch (quotient)
 273         C ebx   src
 274         C ecx   counter, size-1 to 1
 275         C edx   carry (remainder)
 276         C esi   divisor (odd)
 277         C edi
 278         C ebp
 279
 280         movl    -4(%ebx,%ecx,4), %eax
 281         divl    %esi
 282         loop    L(divide_top)
 283
 284
 285         movl    %edx, %eax      C x
 286         movl    %esi, %edx      C y (odd)
 287
 288         movl    %edi, %ebx      C common twos
 289         popl    %esi
 290
 291         popl    %edi
 292         leal    1(%eax), %ecx
 293
 294         orl     %eax, %eax
 295         jnz     L(strip_x_and)
 296
 297
 298         movl    %ebx, %ecx
 299         movl    %edx, %eax
 300
 301         shll    %cl, %eax
 302         popl    %ebx
 303
 304         ret
 305
 306
 307         ALIGN(8)
 308 L(modexact):
 309         C eax
 310         C ebx   src ptr
 311         C ecx   size or size-1
 312         C edx
 313         C esi   y odd
 314         C edi   common twos
 315         C ebp
 316
 317         movl    PARAM_SIZE, %eax
 318         pushl   %esi            FRAME_pushl()
 319
 320         pushl   %eax            FRAME_pushl()
 321
 322         pushl   %ebx            FRAME_pushl()
 323
 324 ifdef(`PIC_WITH_EBX',`
 325         nop     C code alignment
 326         call    L(movl_eip_ebx)
 327         add     $_GLOBAL_OFFSET_TABLE_, %ebx
 328 ')
 329         CALL(   mpn_modexact_1_odd)
 330
 331         movl    %esi, %edx              C y odd
 332         movl    SAVE_ESI, %esi
 333
 334         movl    %edi, %ebx              C common twos
 335         movl    SAVE_EDI, %edi
 336
 337         addl    $eval(FRAME - FRAME_TWO_OR_MORE), %esp
 338         orl     %eax, %eax
 339
 340         leal    1(%eax), %ecx
 341         jnz     L(strip_x_and)
 342
 343
 344         movl    %ebx, %ecx
 345         movl    %edx, %eax
 346
 347         shll    %cl, %eax
 348         popl    %ebx
 349
 350         ret
 351
 352
 353 ifdef(`PIC_WITH_EBX',`
 354 L(movl_eip_ebx):
 355         movl    (%esp), %ebx
 356         ret_internal
 357 ')
 358
 359 EPILOGUE()