source/libs/gmp/gmp-src/mpn/x86_64/mod_34lsub1.asm

   1 dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
   2
   3 dnl  Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
   4 dnl  Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34
  35 C           cycles/limb
  36 C AMD K8,K9      0.67      0.583 is possible with zero-reg instead of $0, 4-way
  37 C AMD K10        0.67      this seems hard to beat
  38 C AMD bd1        1
  39 C AMD bobcat     1.07
  40 C Intel P4       7.35      terrible, use old code
  41 C Intel core2    1.25      1+epsilon with huge unrolling
  42 C Intel NHM      1.15      this seems hard to beat
  43 C Intel SBR      0.93
  44 C Intel atom     2.5
  45 C VIA nano       1.25      this seems hard to beat
  46
  47 C INPUT PARAMETERS
     C ap is the register dereferenced as the source limb pointer (the up of
     C the C signature).  n names the limb-count register, although the code
     C in this file spells that register %rsi directly and, on the n = 2 path,
     C reuses it as scratch once the count has been tested.
  48 define(`ap',    %rdi)
  49 define(`n',     %rsi)
  50
  51 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
     C
     C Folds the n limbs at up into one limb that is congruent to the whole
     C operand modulo 2^48-1.  The result is not fully reduced: for n = 1 the
     C limb is returned unchanged, and otherwise the pieces are combined with
     C plain adds and no final reduction step.
     C
     C Since 2^64 = 2^16, 2^128 = 2^32 and 2^192 = 1 modulo 2^48-1, limbs are
     C summed into three accumulators selected by limb index mod 3, and the
     C three sums are split and shifted into place at the end.
     C
     C Register roles on the n > 2 path:
     C   rax   sum of limbs with index 0 mod 3, also the return value
     C   rcx   sum of limbs with index 1 mod 3
     C   rdx   sum of limbs with index 2 mod 3
     C   r9    count of carries out of rdx, each worth 2^192 = 1
     C   r11   the mask 2^48-1
     C   rsi   limb count, biased so that a borrow ends the main loop
     C   r8, r10, rdi   scratch for the table jump and the final fold
     C
     C The first limb is loaded before n is examined, so n = 0 is not handled.
  52
  53 C TODO
  54 C  * Review feed-in and wind-down code.
  55
     C ABI_SUPPORT, FUNC_ENTRY and FUNC_EXIT are defined outside this file.
     C Presumably they adapt the DOS64 argument registers to the rdi/rsi
     C layout used below -- confirm against the m4 definitions.
  56 ABI_SUPPORT(DOS64)
  57 ABI_SUPPORT(STD64)
  58
  59 ASM_START()
  60         TEXT
  61         ALIGN(32)
  62 PROLOGUE(mpn_mod_34lsub1)
  63         FUNC_ENTRY(2)
  64
     C r11 = 2^48-1, the mask for the low 48 bits of a limb.
  65         mov     $0x0000FFFFFFFFFFFF, %r11
  66
  67         mov     (ap), %rax              C rax = up[0]
  68
     C Three-way split on n using one compare: above 2 takes the general path,
     C below 2 returns the single limb already in rax, exactly 2 falls through.
  69         cmp     $2, %rsi
  70         ja      L(gt2)
  71
  72         jb      L(one)
  73
     C n = 2: fold up[0] and up[1] directly.  rsi is reused to hold up[1].
     C up[0] is split at bit 48; up[1] is split at bit 32, its low half taking
     C weight 2^16 and its high half weight 2^96 = 1.
     C R32 is defined outside this file; presumably it yields the 32-bit
     C register name, so that the move below zero-extends -- confirm.
  74         mov     8(ap), %rsi
  75         mov     %rax, %rdx
  76         shr     $48, %rax               C src[0] high, top 16 bits
  77
  78         and     %r11, %rdx              C src[0] low, bottom 48 bits
  79         add     %rdx, %rax
  80         mov     R32(%rsi), R32(%rdx)
  81
  82         shr     $32, %rsi               C src[1] high
  83         add     %rsi, %rax
  84
  85         shl     $16, %rdx               C src[1] low
  86         add     %rdx, %rax
  87 L(one): FUNC_EXIT()
  88         ret
  89
  90
  91 C Don't change this, the wind-down code is not able to handle greater values
     C The jump table below has 9 entries, one for each leftover limb count
     C 0..8 that can remain when UNROLL is 3.
  92 define(UNROLL,3)
  93
     C n > 2: seed rcx and rdx with up[1] and up[2] alongside up[0] in rax and
     C clear the carry counter.  With UNROLL = 3 the sub leaves rsi = n - 12;
     C a borrow means fewer than 9 limbs remain and the main loop is skipped.
  94 L(gt2): mov     8(ap), %rcx
  95         mov     16(ap), %rdx
  96         xor     %r9, %r9
  97         add     $24, ap
  98         sub     $eval(UNROLL*3+3), %rsi
  99         jc      L(end)
 100         ALIGN(16)
     C Main loop: each pass adds UNROLL groups of three limbs.  Within a group
     C the carry ripples rax -> rcx -> rdx, and the carry out of rdx is counted
     C in r9.  The loop repeats until the sub on rsi borrows.
 101 L(top):
 102         add     (ap), %rax
 103         adc     8(ap), %rcx
 104         adc     16(ap), %rdx
 105         adc     $0, %r9
 106 forloop(i,1,UNROLL-1,`dnl
 107         add     eval(i*24)(ap), %rax
 108         adc     eval(i*24+8)(ap), %rcx
 109         adc     eval(i*24+16)(ap), %rdx
 110         adc     $0, %r9
 111 ')dnl
 112         add     $eval(UNROLL*24), ap
 113         sub     $eval(UNROLL*3), %rsi
 114         jnc     L(top)
 115
     C Here rsi is in -9..-1 and rsi + 9 is the number of limbs still to add,
     C 0..8.  Dispatch through the table to the matching wind-down entry.
     C The PIC form loads a sign-extended 4-byte entry and adds it to the
     C table address held in r8; the non-PIC form jumps through an 8-byte
     C entry.  The displacements 36 and 72 are the bias 9 scaled by the entry
     C size.  JMPENT and JUMPTABSECT are defined outside this file.
 116 L(end):
 117         lea     L(tab)(%rip), %r8
 118 ifdef(`PIC',
 119 `       movslq  36(%r8,%rsi,4), %r10
 120         add     %r10, %r8
 121         jmp     *%r8
 122 ',`
 123         jmp     *72(%r8,%rsi,8)
 124 ')
 125         JUMPTABSECT
 126         ALIGN(8)
 127 L(tab): JMPENT( L(0), L(tab))
 128         JMPENT( L(1), L(tab))
 129         JMPENT( L(2), L(tab))
 130         JMPENT( L(3), L(tab))
 131         JMPENT( L(4), L(tab))
 132         JMPENT( L(5), L(tab))
 133         JMPENT( L(6), L(tab))
 134         JMPENT( L(7), L(tab))
 135         JMPENT( L(8), L(tab))
 136         TEXT
 137
     C Wind-down.  Entry L(k) adds the last k limbs.  The entries are chained:
     C L(k+3) adds one full group of three, advances ap, and falls into L(k).
     C The final step of each chain leaves CF pending for L(cj2) or L(cj1);
     C neither the jmp nor the fall-through changes the flags.
     C
     C Leftover count 0 mod 3: the last step is a full group, carry goes to r9.
 138 L(6):   add     (ap), %rax
 139         adc     8(ap), %rcx
 140         adc     16(ap), %rdx
 141         adc     $0, %r9
 142         add     $24, ap
 143 L(3):   add     (ap), %rax
 144         adc     8(ap), %rcx
 145         adc     16(ap), %rdx
 146         jmp     L(cj1)
 147
     C Leftover count 1 mod 3: the last step adds one limb into rax and
     C ripples its carry through rcx before joining at L(cj2).
 148 L(7):   add     (ap), %rax
 149         adc     8(ap), %rcx
 150         adc     16(ap), %rdx
 151         adc     $0, %r9
 152         add     $24, ap
 153 L(4):   add     (ap), %rax
 154         adc     8(ap), %rcx
 155         adc     16(ap), %rdx
 156         adc     $0, %r9
 157         add     $24, ap
 158 L(1):   add     (ap), %rax
 159         adc     $0, %rcx
 160         jmp     L(cj2)
 161
     C Leftover count 2 mod 3: the last step adds two limbs and falls into
     C L(cj2).
 162 L(8):   add     (ap), %rax
 163         adc     8(ap), %rcx
 164         adc     16(ap), %rdx
 165         adc     $0, %r9
 166         add     $24, ap
 167 L(5):   add     (ap), %rax
 168         adc     8(ap), %rcx
 169         adc     16(ap), %rdx
 170         adc     $0, %r9
 171         add     $24, ap
 172 L(2):   add     (ap), %rax
 173         adc     8(ap), %rcx
 174
     C Finish the pending carry chain: L(cj2) propagates it into rdx and
     C L(cj1) counts a carry out of rdx in r9.
     C L(0) then folds the carry count back in.  Each counted carry is worth
     C 2^192 = 1, so r9 is added to rax; a carry out of rdx in this second
     C chain wraps around to rax for the same reason.  That last adc cannot
     C carry again: CF reaches it only when the add of r9 wrapped rax, which
     C leaves rax below r9.
 175 L(cj2): adc     $0, %rdx
 176 L(cj1): adc     $0, %r9
 177 L(0):   add     %r9, %rax
 178         adc     $0, %rcx
 179         adc     $0, %rdx
 180         adc     $0, %rax
 181
     C Combine the three sums into rax.  Each sum is split where its weight
     C crosses 2^48: the 0mod3 sum at bit 48, the 1mod3 sum at bit 32 and the
     C 2mod3 sum at bit 16.  The high parts have weight 1 and are added as they
     C are; the low parts are shifted left by 0, 16 and 32 bits.  The steps
     C for the three sums are interleaved.  rdi first holds the 0mod3 low part
     C and is reused for the 2mod3 low part once the former has been added.
 182         mov     %rax, %rdi              C 0mod3
 183         shr     $48, %rax               C 0mod3 high
 184
 185         and     %r11, %rdi              C 0mod3 low
 186         mov     R32(%rcx), R32(%r10)    C 1mod3
 187
 188         shr     $32, %rcx               C 1mod3 high
 189
 190         add     %rdi, %rax              C apply 0mod3 low
 191         movzwl  %dx, R32(%rdi)          C 2mod3
 192         shl     $16, %r10               C 1mod3 low
 193
 194         add     %rcx, %rax              C apply 1mod3 high
 195         shr     $16, %rdx               C 2mod3 high
 196
 197         add     %r10, %rax              C apply 1mod3 low
 198         shl     $32, %rdi               C 2mod3 low
 199
 200         add     %rdx, %rax              C apply 2mod3 high
 201         add     %rdi, %rax              C apply 2mod3 low
 202
 203         FUNC_EXIT()
 204         ret
 205 EPILOGUE()