dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C            cycles/limb
C AMD K8,K9      10
C AMD K10        10
C Intel P4       33
C Intel core2    13.25
C Intel corei    14
C Intel atom     42
C VIA nano       43

C A quick adaptation of the 32-bit K7 code.
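C
C The method is exact division: each quotient limb is obtained by multiplying
C with the inverse of the (odd part of the) divisor modulo 2^64, propagating a
C carry limb and a borrow bit from limb to limb.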


C INPUT PARAMETERS
C rp            rdi
C up            rsi
C n             rdx
C divisor       rcx
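C
C In C terms this computes {rp,n} such that {rp,n} * divisor = {up,n}, i.e. a
C call of roughly the form mpn_divexact_1 (rp, up, n, divisor), where divisor
C is assumed to divide the n-limb operand {up,n} exactly.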

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_divexact_1)
        FUNC_ENTRY(4)
        push    %rbx

        mov     %rcx, %rax
        xor     R32(%rcx), R32(%rcx)    C shift count
        mov     %rdx, %r8

        bt      $0, R32(%rax)
        jnc     L(evn)                  C skip bsfq unless divisor is even

L(odd): mov     %rax, %rbx
        shr     R32(%rax)
        and     $127, R32(%rax)         C d/2, 7 bits

        LEA(    binvert_limb_table, %rdx)

        movzbl  (%rdx,%rax), R32(%rax)  C inv 8 bits

        mov     %rbx, %r11              C d without twos

        lea     (%rax,%rax), R32(%rdx)  C 2*inv
        imul    R32(%rax), R32(%rax)    C inv*inv
        imul    R32(%rbx), R32(%rax)    C inv*inv*d
        sub     R32(%rax), R32(%rdx)    C inv = 2*inv - inv*inv*d, 16 bits

        lea     (%rdx,%rdx), R32(%rax)  C 2*inv
        imul    R32(%rdx), R32(%rdx)    C inv*inv
        imul    R32(%rbx), R32(%rdx)    C inv*inv*d
        sub     R32(%rdx), R32(%rax)    C inv = 2*inv - inv*inv*d, 32 bits

        lea     (%rax,%rax), %r10       C 2*inv
        imul    %rax, %rax              C inv*inv
        imul    %rbx, %rax              C inv*inv*d
        sub     %rax, %r10              C inv = 2*inv - inv*inv*d, 64 bits
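
C Each inv = 2*inv - inv*inv*d step above is a Newton (Hensel) iteration for
C the inverse of the odd divisor modulo a power of 2: it doubles the number of
C correct low bits, lifting the 8-bit table value to 16, 32 and finally 64
C bits, so that afterwards d * r10 = 1 (mod 2^64).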

        lea     (%rsi,%r8,8), %rsi      C up end
        lea     -8(%rdi,%r8,8), %rdi    C rp end
        neg     %r8                     C -n
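
C up is now addressed relative to its end and rp relative to its last limb,
C with r8 as a negative index that the loop increments towards zero, so no
C separate counter register is needed.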

        mov     (%rsi,%r8,8), %rax      C up[0]

        inc     %r8
        jz      L(one)

        mov     (%rsi,%r8,8), %rdx      C up[1]

        shrd    R8(%rcx), %rdx, %rax

        xor     R32(%rbx), R32(%rbx)
        jmp     L(ent)

L(evn): bsf     %rax, %rcx
        shr     R8(%rcx), %rax
        jmp     L(odd)
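
C For an even divisor, bsf leaves the count of trailing zero bits in rcx; the
C divisor is shifted down to its odd part, and the dividend limbs are shifted
C right by the same amount on the fly (the shrd above and in the loop, shr for
C the top limb).  Nothing is lost, since those low bits of an exactly
C divisible dividend are zero.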

        ALIGN(8)
L(top):
        C rax   q
        C rbx   carry bit, 0 or 1
        C rcx   shift
        C rdx
        C rsi   up end
        C rdi   rp end
        C r8    counter, limbs, negative
        C r10   d^(-1) mod 2^64
        C r11   d, shifted down
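        C
        C Each iteration produces one quotient limb: the mul below forms q*d
        C for the previous quotient limb, whose high half (rdx) is the carry
        C limb; the next (shift-adjusted) dividend limb has the carry bit and
        C carry limb subtracted, and the difference times the inverse gives
        C the next quotient limb.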

        mul     %r11                    C carry limb in rdx       0 10
        mov     -8(%rsi,%r8,8), %rax    C
        mov     (%rsi,%r8,8), %r9       C
        shrd    R8(%rcx), %r9, %rax     C
        nop                             C
        sub     %rbx, %rax              C apply carry bit
        setc    %bl                     C
        sub     %rdx, %rax              C apply carry limb        5
        adc     $0, %rbx                C                         6
L(ent): imul    %r10, %rax              C                         6
        mov     %rax, (%rdi,%r8,8)      C
        inc     %r8                     C
        jnz     L(top)

        mul     %r11                    C carry limb in rdx
        mov     -8(%rsi), %rax          C up high limb
        shr     R8(%rcx), %rax
        sub     %rbx, %rax              C apply carry bit
        sub     %rdx, %rax              C apply carry limb
        imul    %r10, %rax
        mov     %rax, (%rdi)
        pop     %rbx
        FUNC_EXIT()
        ret

L(one): shr     R8(%rcx), %rax
        imul    %r10, %rax
        mov     %rax, (%rdi)
        pop     %rbx
        FUNC_EXIT()
        ret

EPILOGUE()