source/libs/gmp/gmp-src/mpn/x86_64/core2/gcd_1.asm

   1 dnl  AMD64 mpn_gcd_1 optimised for Intel C2, NHM, SBR and AMD K10, BD.
   2
   3 dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
   4 dnl  Granlund.
   5
   6 dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
   7
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  11 dnl  it under the terms of either:
  12 dnl
  13 dnl    * the GNU Lesser General Public License as published by the Free
  14 dnl      Software Foundation; either version 3 of the License, or (at your
  15 dnl      option) any later version.
  16 dnl
  17 dnl  or
  18 dnl
  19 dnl    * the GNU General Public License as published by the Free Software
  20 dnl      Foundation; either version 2 of the License, or (at your option) any
  21 dnl      later version.
  22 dnl
  23 dnl  or both in parallel, as here.
  24 dnl
  25 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  26 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  27 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  28 dnl  for more details.
  29 dnl
  30 dnl  You should have received copies of the GNU General Public License and the
  31 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  32 dnl  see https://www.gnu.org/licenses/.
  33
  34 include(`../config.m4')
  35
  36
  37 C            cycles/bit (approx)
  38 C AMD K8,K9      8.50
  39 C AMD K10        4.30
  40 C AMD bd1        5.00
  41 C AMD bobcat    10.0
  42 C Intel P4      18.6
  43 C Intel core2    3.83
  44 C Intel NHM      5.17
  45 C Intel SBR      4.69
  46 C Intel atom    17.0
  47 C VIA nano       5.44
  48 C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
  49
  50 C TODO
  51 C  * Optimise inner-loop for specific CPUs.
  52 C  * Use DIV for 1-by-1 reductions, at least for some CPUs.
  53
  54 C Threshold of when to call bmod when U is one limb.  Should be about
  55 C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
     C The value is used as a shift count: in the single-limb case
     C mpn_modexact_1_odd is called only when v0 (after its trailing zero bits
     C have been shifted out) is no greater than u0 >> BMOD_THRES_LOG2;
     C otherwise u0 enters the subtract-and-shift loop unreduced.
  56 define(`BMOD_THRES_LOG2', 6)
  57
  58 C INPUT PARAMETERS
     C up, n and v0 name the first three integer argument registers of the
     C standard (System V) AMD64 calling convention.
     C NOTE(review): for DOS64 builds FUNC_ENTRY(3) presumably moves the
     C incoming arguments into these registers; confirm in the macro definition.
  59 define(`up',    `%rdi')
  60 define(`n',     `%rsi')
  61 define(`v0',    `%rdx')
  62
  63 ABI_SUPPORT(DOS64)
  64 ABI_SUPPORT(STD64)
  65
     C STACK_ALLOC is the number of bytes subtracted from rsp, after the two
     C pushes in mpn_gcd_1, for the duration of the external calls.
     C NOTE(review): 40 for DOS64 looks like 32 bytes of shadow space plus 8
     C bytes of alignment padding, and 8 for STD64 like alignment padding only;
     C confirm against the respective ABI documents.
  66 IFDOS(`define(`STACK_ALLOC', 40)')
  67 IFSTD(`define(`STACK_ALLOC', 8)')
  68
  69 C Undo some configure cleverness.
  70 C The problem is that C only defines the '1c' variant, and that configure
  71 C therefore considers modexact_1c to be the base function.  It then adds a
  72 C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep
  73 C gcd_1 exists without a corresponding cpudep mode1o.
     C When WANT_FAT_BINARY is defined, the name mpn_modexact_1_odd used in the
     C CALL inside mpn_gcd_1 is redefined here to MPN_PREFIX (defined elsewhere)
     C followed by modexact_1_odd_x86_64.
  74 ifdef(`WANT_FAT_BINARY', `
  75   define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')')
  76
  77
  78 ASM_START()
  79         TEXT
  80         ALIGN(16)
     C mpn_gcd_1 (up, n, v0) -- GCD of the n-limb operand U at up and the
     C single limb v0.  The result is left in %rax.
     C
     C Steps, as coded below:
     C  1. Save min(ctz(u0),ctz(v0)), the power of two common to both
     C     operands, and shift the trailing zero bits out of v0.
     C  2. Reduce U to one limb in %rax:
     C       n = 1 and v0 > (u0 >> BMOD_THRES_LOG2)   use u0 unchanged
     C       n = 1 otherwise                          mpn_modexact_1_odd
     C       n != 1, n < BMOD_1_TO_MOD_1_THRESHOLD    mpn_modexact_1_odd
     C       n != 1, n >= BMOD_1_TO_MOD_1_THRESHOLD   mpn_mod_1
     C     NOTE(review): both callees are external; presumably each returns a
     C     limb whose GCD with the odd v0 equals that of U -- confirm.
     C  3. Run a subtract-and-shift (binary) GCD loop on that limb and the odd
     C     v0 until their difference is zero.
     C  4. Shift the surviving value left by the count saved in step 1.
     C
     C Registers written here: rax, rcx, rdx, r8, r9, r10 (plus whatever the
     C callees and the FUNC_ENTRY/FUNC_EXIT macros touch).
     C
     C NOTE(review): bsf leaves its destination undefined for a zero source,
     C so the entry code appears to rely on v0 being nonzero; confirm against
     C the documented mpn_gcd_1 preconditions.
  81 PROLOGUE(mpn_gcd_1)
  82         FUNC_ENTRY(3)
  83         mov     (up), %rax      C U low limb
  84         or      v0, %rax
  85         bsf     %rax, %rax      C min(ctz(u0),ctz(v0))
  86
     C rcx = ctz(v0); shifting v0 right by that count leaves it odd.
  87         bsf     v0, %rcx
  88         shr     R8(%rcx), v0
  89
     C Stack from here to L(reduced), lowest address first: STACK_ALLOC bytes
     C of padding, the odd v0, the common-twos count.
  90         push    %rax            C preserve common twos over call
  91         push    v0              C preserve v0 argument over call
  92         sub     $STACK_ALLOC, %rsp      C maintain ABI required rsp alignment
  93
     C A single-limb U is handled inline; any other n goes to L(reduce_nby1).
  94         cmp     $1, n
  95         jnz     L(reduce_nby1)
  96
  97 C Both U and V are single limbs, reduce with bmod if u0 >> v0.
     C Concretely: rax = u0, and the call is skipped (unsigned compare) when
     C v0 > (u0 >> BMOD_THRES_LOG2).
  98         mov     (up), %r8
  99         mov     %r8, %rax
 100         shr     $BMOD_THRES_LOG2, %r8
 101         cmp     %r8, v0
 102         ja      L(reduced)
 103         jmp     L(bmod)
 104
 105 L(reduce_nby1):
     C Signed compare of n with BMOD_1_TO_MOD_1_THRESHOLD (defined outside this
     C file): below it use mpn_modexact_1_odd, at or above it use mpn_mod_1.
 106         cmp     $BMOD_1_TO_MOD_1_THRESHOLD, n
 107         jl      L(bmod)
     C DOS64 only: copy the three arguments from rdi, rsi, rdx to rcx, rdx, r8.
     C rdx is copied out first because the second move overwrites it.
 108 IFDOS(` mov     %rdx, %r8       ')
 109 IFDOS(` mov     %rsi, %rdx      ')
 110 IFDOS(` mov     %rdi, %rcx      ')
     C NOTE(review): the expected condition is nz although rsp is meant to be
     C 16-byte aligned at the call; presumably the ASSERT macro itself pushes
     C something (e.g. the flags) before running the test -- confirm against
     C the definition of ASSERT.
 111         ASSERT(nz, `test $15, %rsp')
 112         CALL(   mpn_mod_1)
 113         jmp     L(reduced)
 114 L(bmod):
     C Same argument shuffle and alignment check as for the mpn_mod_1 call.
 115 IFDOS(` mov     %rdx, %r8       ')
 116 IFDOS(` mov     %rsi, %rdx      ')
 117 IFDOS(` mov     %rdi, %rcx      ')
 118         ASSERT(nz, `test $15, %rsp')
 119         CALL(   mpn_modexact_1_odd)
 120 L(reduced):
 121
     C rax = reduced U limb (u0 itself or the value returned by the call).
     C Drop the padding and reload the odd v0 into rdx; the common-twos count
     C stays on the stack until L(end).
 122         add     $STACK_ALLOC, %rsp
 123         pop     %rdx
 124
     C rcx = ctz(rax).  bsf also sets ZF when rax is zero, and the jnz below
     C tests that flag: a zero remainder skips the loop, leaving rdx as the
     C odd part of the result.
 125         bsf     %rax, %rcx
 126 C       test    %rax, %rax      C FIXME: does this lower latency?
 127         jnz     L(mid)
 128         jmp     L(end)
 129
     C Loop state at L(mid): rax = x, nonzero, with ctz(x) in rcx; rdx = y, odd.
     C Each pass shifts the trailing zeros out of x, then computes y-x into r10
     C and x-y into rax, keeping the shifted x in r9 and ctz(y-x) in rcx for
     C the next shift.  When x = y the final sub gives zero and the loop falls
     C through to L(end) with the result in rdx.  Otherwise the two cmovc at
     C L(top) use the carry of that sub (set when x < y) to replace the pair by
     C (y-x, x); without carry the pair is already (x-y, y).
     C The mov and cmovc instructions leave the flags alone and the bsf comes
     C before the final sub, so both cmovc and the jnz see the flags of that
     C sub.
 130         ALIGN(16)               C               K10   BD    C2    NHM   SBR
 131 L(top): cmovc   %r10, %rax      C if x-y < 0    0,3   0,3   0,6   0,5   0,5
 132         cmovc   %r9, %rdx       C use x,y-x     0,3   0,3   2,8   1,7   1,7
 133 L(mid): shr     R8(%rcx), %rax  C               1,7   1,6   2,8   2,8   2,8
 134         mov     %rdx, %r10      C               1     1     4     3     3
 135         sub     %rax, %r10      C               2     2     5     4     4
 136         bsf     %r10, %rcx      C               3     3     6     5     5
 137         mov     %rax, %r9       C               2     2     3     3     4
 138         sub     %rdx, %rax      C               2     2     4     3     4
 139         jnz     L(top)          C
 140
     C rdx = odd part of the result.  Pop the common-twos count saved by the
     C first push and shift it back in to form the return value in rax.
 141 L(end): pop     %rcx
 142         mov     %rdx, %rax
 143         shl     R8(%rcx), %rax
 144         FUNC_EXIT()
 145         ret
 146 EPILOGUE()