source/libs/gmp/gmp-src/mpn/x86_64/coreinhm/aorsmul_1.asm

   1 dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nehalem.
   2
   3 dnl  Contributed to the GNU project by Torbjörn Granlund.
   4
   5 dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C            cycles/limb
  36 C AMD K8,K9
  37 C AMD K10
  38 C AMD bull
  39 C AMD pile
  40 C AMD bobcat
  41 C AMD jaguar
  42 C Intel P4
  43 C Intel core
  44 C Intel NHM      4.55  with minor fluctuations
  45 C Intel SBR
  46 C Intel IBR
  47 C Intel HWL
  48 C Intel BWL
  49 C Intel atom
  50 C VIA nano
  51
  52 C The loop of this code is the result of running a code generation and
  53 C optimization tool suite written by David Harvey and Torbjorn Granlund.
  54
  55 C N.B.: Be careful if editing, making sure the loop alignment padding does not
  56 C become large, as we currently fall into it.
  57
  58 define(`rp',      `%rdi')   C arg 1: destination limb pointer, loaded from and stored to (DOS64: rcx)
  59 define(`up',      `%rsi')   C arg 2: source limb pointer, only loaded from (DOS64: rdx)
  60 define(`n_param', `%rdx')   C arg 3: limb count; rdx is overwritten by the first mul (DOS64: r8)
  61 define(`v0',      `%rcx')   C arg 4: multiplier limb used by every mul (DOS64: r9)
  62
  63 define(`n',       `%rbx')   C limb index: small constant minus the count, stepped by 4 until it carries
  64 C An OPERATION_ symbol defined outside this file selects ADDSUB (add or sub) and func.
  65 ifdef(`OPERATION_addmul_1',`
  66   define(`ADDSUB', `add')
  67   define(`func',   `mpn_addmul_1')
  68 ')
  69 ifdef(`OPERATION_submul_1',`
  70   define(`ADDSUB', `sub')
  71   define(`func',   `mpn_submul_1')
  72 ')
  73
  74 ABI_SUPPORT(DOS64)
  75 ABI_SUPPORT(STD64)
  76
  77 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
  78 C func(rp, up, n_param, v0): each of the n_param limbs at rp becomes rp[i] ADDSUB up[i] * v0, carries propagate upward, and the final high limb is left in rax.
  79 ASM_START()
  80         TEXT
  81         ALIGN(32)
  82 PROLOGUE(func)
  83         FUNC_ENTRY(4)                  C opaque macro; presumably remaps the 4 DOS64 args, confirm in the m4 defs
  84         push    %rbx                   C n aliases rbx, which must be preserved for the caller
  85
  86         mov     (up), %rax             C rax = up[0]; read unconditionally, so a count of at least 1 is assumed
  87         lea     -8(up,n_param,8), up   C up now points at the last source limb
  88         mov     (rp), %r8              C r8 = rp[0]
  89         lea     -8(rp,n_param,8), rp   C rp now points at the last destination limb
  90
  91         test    $1, R8(n_param)        C dispatch on the two low bits of the limb count
  92         jnz     L(bx1)                 C odd count
  93
  94 L(bx0): test    $2, R8(n_param)
  95         jnz     L(b10)                 C count = 2 mod 4
  96
  97 L(b00): mov     $3, R32(n)             C count = 0 mod 4
  98         sub     n_param, n             C n = 3 - count; must precede mul, which overwrites rdx
  99         mul     v0                     C rdx:rax = up[0] * v0
 100         mov     $0, R32(%r11)          C r11 = 0, will collect the high limb
 101         mov     %r8, %r10              C r10 = rp[0]
 102         ADDSUB  %rax, %r10             C r10 = rp[0] ADDSUB low product
 103         mov     -8(up,n,8), %rax       C rax = up[1]; mov leaves CF intact for the adc
 104         adc     %rdx, %r11             C r11 = high product + carry or borrow
 105         jmp     L(lo0)                 C lo0 loads rp[1] and later stores r10 as rp[0]
 106
 107 L(b10): mov     $1, R32(n)             C count = 2 mod 4
 108         sub     n_param, n             C n = 1 - count
 109         mul     v0                     C rdx:rax = up[0] * v0
 110         mov     %r8, %r10              C r10 = rp[0]
 111         mov     $0, R32(%r11)
 112         ADDSUB  %rax, %r10             C r10 = rp[0] ADDSUB low product
 113         mov     8(up,n,8), %rax        C rax = up[1]
 114         adc     %rdx, %r11             C r11 = high product + carry or borrow
 115         jmp     L(lo2)                 C lo2 loads rp[1] and stores r10 as rp[0]
 116
 117 L(bx1): test    $2, R8(n_param)
 118         jz      L(b01)                 C count = 1 mod 4
 119
 120 L(b11): mov     $2, R32(n)             C count = 3 mod 4
 121         sub     n_param, n             C n = 2 - count
 122         mul     v0                     C rdx:rax = up[0] * v0
 123         ADDSUB  %rax, %r8              C r8 = rp[0] ADDSUB low product
 124         mov     $0, R32(%r9)           C mov rather than xor: CF from ADDSUB must reach the adc
 125         mov     (up,n,8), %rax         C rax = up[1]
 126         adc     %rdx, %r9              C r9 = high product + carry or borrow
 127         jmp     L(lo3)                 C lo3 loads rp[1] and later stores r8 as rp[0]
 128
 129 L(b01): mov     $0, R32(n)             C count = 1 mod 4
 130         sub     n_param, n             C n = -count
 131         xor     %r11, %r11             C no pending high limb yet; no flags are live here
 132         add     $4, n
 133         jc      L(end)                 C carry means the count was 1: only the tail limb remains
 134
 135         ALIGN(32)                      C execution falls through this padding when the jc is not taken
 136 L(top): mul     v0                     C slot A: rax holds the source limb matching -24(rp,n,8)
 137         ADDSUB  %rax, %r8              C r8 = destination limb A ADDSUB low product
 138         mov     $0, R32(%r9)           C mov $0 keeps CF alive for the adc below; xor would clear it
 139         mov     -16(up,n,8), %rax      C preload source limb B
 140         adc     %rdx, %r9              C r9 = high product of A + carry or borrow
 141 L(lo1): mul     v0                     C slot B; lo1 is not a branch target in the code shown here
 142         ADDSUB  %r11, %r8              C fold the pending high limb into A
 143         mov     $0, R32(%r11)
 144         mov     -16(rp,n,8), %r10      C r10 = destination limb B
 145         adc     $0, %r9                C carry or borrow from the fold goes to the high limb of A
 146         ADDSUB  %rax, %r10
 147         mov     -8(up,n,8), %rax       C preload source limb C
 148         adc     %rdx, %r11             C r11 = high product of B + carry or borrow
 149         mov     %r8, -24(rp,n,8)       C store finished limb A
 150         ADDSUB  %r9, %r10              C fold the high limb of A into B
 151         adc     $0, %r11
 152 L(lo0): mov     -8(rp,n,8), %r8        C slot C, entry for count = 0 mod 4; r8 = destination limb C
 153         mul     v0
 154         ADDSUB  %rax, %r8
 155         mov     $0, R32(%r9)
 156         mov     (up,n,8), %rax         C preload source limb D
 157         adc     %rdx, %r9              C r9 = high product of C + carry or borrow
 158         mov     %r10, -16(rp,n,8)      C store finished limb B
 159         ADDSUB  %r11, %r8              C fold the high limb of B into C
 160         adc     $0, %r9
 161 L(lo3): mul     v0                     C slot D, entry for count = 3 mod 4
 162         mov     (rp,n,8), %r10         C r10 = destination limb D
 163         mov     $0, R32(%r11)
 164         ADDSUB  %rax, %r10
 165         mov     8(up,n,8), %rax        C preload the source limb for the next iteration or the tail
 166         adc     %rdx, %r11             C r11 = high product of D + carry or borrow
 167         mov     %r8, -8(rp,n,8)        C store finished limb C
 168         ADDSUB  %r9, %r10              C fold the high limb of C into D
 169         adc     $0, %r11               C r11 is the high limb pending for the next limb
 170 L(lo2): mov     8(rp,n,8), %r8         C entry for count = 2 mod 4; r8 = next destination limb
 171         mov     %r10, (rp,n,8)         C store finished limb D
 172         add     $4, n                  C advance four limbs
 173         jnc     L(top)                 C a carry out of n means only the last limb remains
 174
 175 L(end): mul     v0                     C tail: rax = last source limb, r8 = last destination limb
 176         ADDSUB  %rax, %r8
 177         mov     $0, R32(%rax)          C rax becomes the result accumulator; mov preserves CF
 178         adc     %rdx, %rax             C rax = high product + carry or borrow
 179         ADDSUB  %r11, %r8              C fold in the pending high limb
 180         adc     $0, %rax
 181         mov     %r8, (rp)              C rp was moved to the last destination limb at entry
 182
 183         pop     %rbx                   C restore the register saved at entry
 184         FUNC_EXIT()                    C opaque macro; presumably the DOS64 counterpart of FUNC_ENTRY
 185         ret                            C the final high limb is in rax
 186 EPILOGUE()
 187 ASM_END()