source/libs/gmp/gmp-src/mpn/x86_64/mod_1_4.asm

dnl AMD64 mpn_mod_1s_4p

dnl Contributed to the GNU project by Torbjorn Granlund.

dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3
C AMD K10	 3
C Intel P4	15.5
C Intel core2	 5
C Intel corei	 4
C Intel atom	23
C VIA nano	 4.75

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
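
C Entry registers, as read by the code below (the names up, n, b and cps
C are descriptive labels added in this comment, not from the original):
C   %rdi  up   limb vector
C   %rsi  n    number of limbs
C   %rdx  b    divisor
C   %rcx  cps  table written by mpn_mod_1s_4p_cps: bi, cnt, B1modb..B5modb
C              at byte offsets 0, 8, 16, ..., 48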
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	FUNC_ENTRY(4)
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	%rdx, %r15
	mov	%rcx, %r14
	mov	16(%rcx), %r11		C B1modb
	mov	24(%rcx), %rbx		C B2modb
	mov	32(%rcx), %rbp		C B3modb
	mov	40(%rcx), %r13		C B4modb
	mov	48(%rcx), %r12		C B5modb
	xor	R32(%r8), R32(%r8)
	mov	R32(%rsi), R32(%rdx)
	and	$3, R32(%rdx)
	je	L(b0)
	cmp	$2, R32(%rdx)
	jc	L(b1)
	je	L(b2)
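
C Dispatch on n mod 4.  Each entry point below loads or folds the most
C significant n mod 4 limbs (four when n is a multiple of 4) into the
C two-limb residue <%r8,%r9>, then joins the common path at L(m0)/L(m1).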
L(b3):	lea	-24(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	jmp	L(m0)

	ALIGN(8)
L(b0):	lea	-32(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	mov	24(%rdi), %rax
	mul	%rbp
	jmp	L(m0)

	ALIGN(8)
L(b1):	lea	-8(%rdi,%rsi,8), %rdi
	mov	(%rdi), %r9
	jmp	L(m1)

	ALIGN(8)
L(b2):	lea	-16(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %r8
	mov	(%rdi), %r9
	jmp	L(m1)
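
C Main loop, four limbs per iteration.  With B = 2^64 and Bkmodb = B^k mod b
C (from the cps table), the new residue is
C   <%r8,%r9> = up[0] + up[1]*B1modb + up[2]*B2modb + up[3]*B3modb
C               + rl*B4modb + rh*B5modb
C which is congruent to the old value mod b, so the operand shrinks by
C four limbs per round while the remainder class is preserved.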
	ALIGN(16)
L(top):	mov	-24(%rdi), %rax
	mov	-32(%rdi), %r10
	mul	%r11			C up[1] * B1modb
	add	%rax, %r10
	mov	-16(%rdi), %rax
	mov	$0, R32(%rcx)
	adc	%rdx, %rcx
	mul	%rbx			C up[2] * B2modb
	add	%rax, %r10
	mov	-8(%rdi), %rax
	adc	%rdx, %rcx
	sub	$32, %rdi
	mul	%rbp			C up[3] * B3modb
	add	%rax, %r10
	mov	%r13, %rax
	adc	%rdx, %rcx
	mul	%r9			C rl * B4modb
	add	%rax, %r10
	mov	%r12, %rax
	adc	%rdx, %rcx
	mul	%r8			C rh * B5modb
	mov	%r10, %r9
	mov	%rcx, %r8
L(m0):	add	%rax, %r9
	adc	%rdx, %r8
L(m1):	sub	$4, %rsi
	ja	L(top)
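
C Final step: fold the residue high limb via B1modb, shift everything left
C by cnt (cps[1]) so the divisor is normalised, then perform one
C reciprocal-based division step with bi (cps[0]) followed by conditional
C corrections, and shift the remainder back down by cnt.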
L(end):	mov	8(%r14), R32(%rsi)
	mov	%r8, %rax
	mul	%r11
	mov	%rax, %r8
	add	%r9, %r8
	adc	$0, %rdx
	xor	R32(%rcx), R32(%rcx)
	sub	R32(%rsi), R32(%rcx)
	mov	%r8, %rdi
	shr	R8(%rcx), %rdi
	mov	R32(%rsi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %rdi
	mov	%rdi, %rax
	mulq	(%r14)
	mov	%r15, %rbx
	mov	%rax, %r9
	sal	R8(%rcx), %r8
	inc	%rdi
	add	%r8, %r9
	adc	%rdi, %rdx
	imul	%rbx, %rdx
	sub	%rdx, %r8
	lea	(%r8,%rbx), %rax
	cmp	%r8, %r9
	cmovc	%rax, %r8
	mov	%r8, %rax
	sub	%rbx, %rax
	cmovc	%r8, %rax
	shr	R8(%rcx), %rax
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret
EPILOGUE()
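
C mpn_mod_1s_4p_cps(cps, b) fills the seven-limb table consumed above
C (layout inferred from the stores below, %rbx holding the cps pointer):
C   cps[0] = bi  (mpn_invert_limb of b << cnt)
C   cps[1] = cnt (leading zero count of b)
C   cps[2..6] = B^1 mod b .. B^5 mod b, each computed against b << cnt and
C               shifted back down by cnt before being stored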
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
	IFSTD(`	mov	%r12, %rdi	')	C pass parameter
	IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
	mov	%r12, %r8
	mov	%rax, %r11
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8
	mov	R32(%rbp), R32(%rcx)
	mov	$1, R32(%rsi)
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb
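
C The four near-identical blocks below each advance from Bkmodb (still
C shifted by cnt, in %rsi) to B(k+1)modb with one reciprocal-based
C division step (multiply by bi, quotient estimate, conditional fix-up),
C storing the down-shifted result in the next table slot.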
	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 32(%rbx)		C store B3modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 40(%rbx)		C store B4modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12

	shr	R8(%rcx), %r12
	mov	%r12, 48(%rbx)		C store B5modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()