source/libs/gmp/gmp-src/mpn/x86_64/nano/dive_1.asm

   1 dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C            cycles/limb
  35 C              norm            unorm
  36 C AMD K8,K9     11              11
  37 C AMD K10       11              11
  38 C Intel P4       ?
  39 C Intel core2   13.5            13.25
  40 C Intel corei   14.25
  41 C Intel atom    34              36
  42 C VIA nano      19.25           19.25
  43
  44
  45 C INPUT PARAMETERS
  46 C rp            rdi
  47 C up            rsi
  48 C n             rdx
  49 C divisor       rcx
  50
  51 ABI_SUPPORT(DOS64)                      C NOTE(review): macro defined outside this file; presumably declares DOS64 (Windows x64) ABI support -- confirm
  52 ABI_SUPPORT(STD64)                      C NOTE(review): presumably declares STD64 (System V) ABI support -- confirm
  53
  54 ASM_START()
  55         TEXT
  56         ALIGN(16)
  57 PROLOGUE(mpn_divexact_1)                C rp[0..n-1] = up[0..n-1] / divisor (exact division, per the file header)
  58         FUNC_ENTRY(4)                   C NOTE(review): macro defined outside this file; presumably ABI entry glue for 4 args -- confirm
  59         push    %rbx                    C rbx is used as scratch below; restored before each ret
  60 C Setup: rax = divisor, rcx = shift count (starts at 0), r8 = n.
  61         mov     %rcx, %rax              C rax = divisor
  62         xor     R32(%rcx), R32(%rcx)    C shift count
  63         mov     %rdx, %r8               C r8 = n; rdx is reused for the table address and by mul
  64 C Make d odd: for an even divisor, count = index of its lowest set bit and d = divisor >> count.
  65         bt      $0, R32(%rax)           C CF = low bit of divisor
  66         jc      L(odd)                  C odd divisor: count stays 0, skip the bsf
  67         bsf     %rax, %rcx              C count = index of lowest set bit
  68         shr     R8(%rcx), %rax          C rax = divisor >> count
  69 L(odd): mov     %rax, %rbx              C rbx = odd d
  70         shr     R32(%rax)               C drop the low bit (always 1 here)
  71         and     $127, R32(%rax)         C d/2, 7 bits
  72 C (d/2) & 127 indexes binvert_limb_table; the byte found there seeds the inverse of d.
  73         LEA(    binvert_limb_table, %rdx)
  74
  75         movzbl  (%rdx,%rax), R32(%rax)  C inv 8 bits
  76
  77         mov     %rbx, %r11              C d without twos
  78 C Three Newton steps, inv = 2*inv - inv*inv*d, widen the inverse 8 -> 16 -> 32 -> 64 bits; result in r10.
  79         lea     (%rax,%rax), R32(%rdx)  C 2*inv
  80         imul    R32(%rax), R32(%rax)    C inv*inv
  81         imul    R32(%rbx), R32(%rax)    C inv*inv*d
  82         sub     R32(%rax), R32(%rdx)    C inv = 2*inv - inv*inv*d, 16 bits
  83
  84         lea     (%rdx,%rdx), R32(%rax)  C 2*inv
  85         imul    R32(%rdx), R32(%rdx)    C inv*inv
  86         imul    R32(%rbx), R32(%rdx)    C inv*inv*d
  87         sub     R32(%rdx), R32(%rax)    C inv = 2*inv - inv*inv*d, 32 bits
  88
  89         lea     (%rax,%rax), %r10       C 2*inv
  90         imul    %rax, %rax              C inv*inv
  91         imul    %rbx, %rax              C inv*inv*d
  92         sub     %rax, %r10              C inv = 2*inv - inv*inv*d, 64 bits
  93 C Point rsi just past up[n-1] and rdi at rp[n-1]; r8 becomes a negative index that counts up to zero.
  94         lea     (%rsi,%r8,8), %rsi      C up end
  95         lea     -8(%rdi,%r8,8), %rdi    C rp end
  96         neg     %r8                     C -n
  97
  98         mov     (%rsi,%r8,8), %rax      C up[0]
  99 C r8 = 1-n after the inc; zero means n == 1, which is handled at L(one).
 100         inc     %r8
 101         jz      L(one)
 102 C Choose the loop: count == 0 uses L(ntop); otherwise limbs are shifted on the fly in L(utop).
 103         test    R32(%rcx), R32(%rcx)
 104         jnz     L(unorm)                C branch if count != 0
 105         xor     R32(%rbx), R32(%rbx)    C borrow = 0
 106         jmp     L(nent)                 C enter mid-loop: up[0] has no incoming carry
 107 C Normalized loop: q[i] = (up[i] - borrow - high(q[i-1] * d)) * inv mod 2^64.
 108         ALIGN(8)
 109 L(ntop):mul     %r11                    C carry limb in rdx     0 10
 110         mov     -8(%rsi,%r8,8), %rax    C next dividend limb
 111         sub     %rbx, %rax              C apply carry bit
 112         setc    %bl                     C bl = borrow out of that subtract
 113         sub     %rdx, %rax              C apply carry limb      5
 114         adc     $0, %rbx                C fold in second borrow 6
 115 L(nent):imul    %r10, %rax              C rax *= inv            6
 116         mov     %rax, (%rdi,%r8,8)      C store quotient limb
 117         inc     %r8                     C advance index towards zero
 118         jnz     L(ntop)
 119 C The top limb is finished by the shared tail at L(com).
 120         mov     -8(%rsi), %r9           C up high limb
 121         jmp     L(com)
 122 C Unnormalized entry: rax = low limb of up >> count, built from up[0] and up[1].
 123 L(unorm):
 124         mov     (%rsi,%r8,8), %r9       C up[1]
 125         shr     R8(%rcx), %rax          C up[0] >> count
 126         neg     R32(%rcx)               C cl = -count; 64-bit shifts use cl mod 64, i.e. 64-count
 127         shl     R8(%rcx), %r9           C up[1] << (64-count)
 128         neg     R32(%rcx)               C cl = count again
 129         or      %r9, %rax               C combine the two parts
 130         xor     R32(%rbx), R32(%rbx)    C borrow = 0
 131         jmp     L(uent)
 132 C Unnormalized loop: same recurrence as L(ntop) on limbs of up >> count; cl flips between count and -count.
 133         ALIGN(8)
 134 L(utop):mul     %r11                    C carry limb in rdx     0 10
 135         mov     (%rsi,%r8,8), %rax      C next source limb
 136         shl     R8(%rcx), %rax          C << (64-count); cl = -count here
 137         neg     R32(%rcx)               C cl = count
 138         or      %r9, %rax               C merge with previous limb >> count
 139         sub     %rbx, %rax              C apply carry bit
 140         setc    %bl                     C bl = borrow out of that subtract
 141         sub     %rdx, %rax              C apply carry limb      5
 142         adc     $0, %rbx                C fold in second borrow 6
 143 L(uent):imul    %r10, %rax              C rax *= inv            6
 144         mov     (%rsi,%r8,8), %r9       C reload the limb that supplied the high bits
 145         shr     R8(%rcx), %r9           C >> count: low part of the next shifted limb
 146         neg     R32(%rcx)               C cl = -count for the shl at L(utop)
 147         mov     %rax, (%rdi,%r8,8)      C store quotient limb
 148         inc     %r8                     C advance index towards zero
 149         jnz     L(utop)
 150 C Shared tail: r9 = top dividend limb (already >> count on the unnormalized path), rax = previous quotient limb.
 151 L(com): mul     %r11                    C carry limb in rdx
 152         sub     %rbx, %r9               C apply carry bit
 153         sub     %rdx, %r9               C apply carry limb
 154         imul    %r10, %r9
 155         mov     %r9, (%rdi)             C rp[n-1]
 156         pop     %rbx
 157         FUNC_EXIT()
 158         ret
 159 C n == 1: rp[0] = (up[0] >> count) * inv.
 160 L(one): shr     R8(%rcx), %rax
 161         imul    %r10, %rax
 162         mov     %rax, (%rdi)
 163         pop     %rbx
 164         FUNC_EXIT()
 165         ret
 166 EPILOGUE()